mirror of
https://github.com/luau-lang/luau.git
synced 2025-01-05 19:09:11 +00:00
c5089def6e
* Fix a bug where reading a property from an unsealed table caused inference to improperly infer the existence of that property. * Fix #827 We have also made a lot of progress on the new solver and the JIT. Both projects are still in the process of being built out. Neither are ready for general use yet. We are mostly working to tighten up how the new solver handles refinements and updates to unsealed tables to bring it up to the same level as the old solver. --------- Co-authored-by: Arseny Kapoulkine <arseny.kapoulkine@gmail.com> Co-authored-by: Vyacheslav Egorov <vegorov@roblox.com>
458 lines
21 KiB
Python
458 lines
21 KiB
Python
#!/usr/bin/python3
|
|
# This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
|
|
|
|
import argparse
|
|
import asyncio
|
|
import copy
|
|
import json
|
|
import math
|
|
import os
|
|
import platform
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import textwrap
|
|
from enum import Enum
|
|
|
|
def add_parser(subparsers):
    """Register the `flag-bisect` subcommand on an argparse subparsers object.

    The subcommand gets this tool's summary as both its help and description,
    the wrapped usage notes as its epilog, and `flag_bisect_main` as its
    `func` default.

    Returns the parser created for the subcommand.
    """
    parser_kwargs = {
        'help': help(),
        'description': help(),
        'epilog': epilog(),
        # The epilog is already wrapped for the terminal, so argparse must not re-flow it.
        'formatter_class': argparse.RawDescriptionHelpFormatter,
    }
    command = subparsers.add_parser('flag-bisect', **parser_kwargs)

    add_argument_parsers(command)
    command.set_defaults(func=flag_bisect_main)
    return command
|
|
|
|
def help():
    """Return the one-line summary of what this tool does."""
    summary = 'Search for a set of flags triggering the faulty behavior in unit tests'
    return summary
|
|
|
|
def get_terminal_width():
    """Return the terminal width in columns, or 80 when it cannot be queried."""
    try:
        return os.get_terminal_size().columns
    except OSError:
        # Raised when no terminal is attached (e.g. output is piped or
        # redirected). Only this failure is handled so that KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return 80
|
|
def wrap_text(text, width):
    """Re-flow `text` to `width` columns, paragraph by paragraph.

    Paragraphs are runs of non-blank lines separated by blank lines. The
    leading spaces of a paragraph's first line become the indent of the first
    wrapped line; the leading spaces of its second line (or of the first line
    when there is only one) become the indent of all following wrapped lines,
    which preserves hanging indents.

    Returns the wrapped paragraphs, each followed by an empty line. Text with
    no non-blank lines yields an empty string.
    """
    leading_whitespace_re = re.compile('( *)')

    def get_paragraphs_and_indent(string):
        # Yields (paragraph text, first-line indent, continuation indent).
        paragraph_lines = []
        initial_indent = ''
        subsequent_indent = ''
        for line in string.split('\n'):
            stripped = line.strip()
            if not stripped:
                # A blank line closes the paragraph collected so far, if any.
                if paragraph_lines:
                    yield '\n'.join(paragraph_lines), initial_indent, subsequent_indent
                    paragraph_lines = []
                continue

            indent = leading_whitespace_re.match(line).group(1)
            if not paragraph_lines:
                initial_indent = subsequent_indent = indent
            elif len(paragraph_lines) == 1:
                subsequent_indent = indent
            paragraph_lines.append(stripped)

        # Text that does not end with a blank line still has a paragraph
        # pending here; emit it instead of silently dropping it.
        if paragraph_lines:
            yield '\n'.join(paragraph_lines), initial_indent, subsequent_indent

    wrapped = [
        textwrap.fill(
            paragraph,
            width=width,
            initial_indent=initial_indent,
            subsequent_indent=subsequent_indent,
            break_on_hyphens=False,
        )
        for paragraph, initial_indent, subsequent_indent in get_paragraphs_and_indent(text)
    ]
    return ''.join(paragraph + '\n\n' for paragraph in wrapped)
|
|
|
|
def wrap_text_for_terminal(text):
    """Wrap `text` to the current terminal width, never narrower than 20 columns."""
    # Two columns are kept free on the right; this margin matches what
    # argparse uses when formatting argument documentation.
    usable_width = get_terminal_width() - 2
    if usable_width < 20:
        usable_width = 20
    return wrap_text(text, usable_width)
|
|
|
|
# Build the long-form usage notes used as the argparse epilog: the
# triple-quoted text below is passed through wrap_text_for_terminal and the
# wrapped result is returned.
def epilog():
|
|
return wrap_text_for_terminal('''
|
|
This tool uses the delta debugging algorithm to minimize the set of flags to the ones that are faulty in your unit tests,
|
|
and the usage is trivial. Just provide a path to the unit test and you're done, the tool will do the rest.
|
|
|
|
There are many use cases with flag-bisect. Included but not limited to:
|
|
|
|
1: If your test is failing when you omit `--fflags=true` but it works when passing `--fflags=true`, then you can
|
|
use this tool to find that set of flag requirements to see which flags are missing that will help to fix it. Ditto
|
|
for the opposite too, this tool is generalized for that case.
|
|
|
|
2: If you happen to run into a problem on production, and you're not sure which flags is the problem and you can easily
|
|
create a unit test, you can run flag-bisect on that unit test to rapidly find the set of flags.
|
|
|
|
3: If you have a flag that causes a performance regression, there's also the `--timeout=N` where `N` is in seconds.
|
|
|
|
4: If you have tests that are demonstrating flakiness behavior, you can also use `--tries=N` where `N` is the number of
|
|
attempts to run the same set of flags before moving on to the new set. This will eventually drill down to the flaky flag(s).
|
|
Generally 8 tries should be more than enough, but it depends on the rarity. The more rare it is, the higher the attempts count
|
|
needs to be. Note that this comes with a performance cost the higher you go, but certainly still faster than manual search.
|
|
This argument will disable parallel mode by default. If this is not desired, explicitly write `--parallel=on`.
|
|
|
|
5: By default flag-bisect runs in parallel mode which uses a slightly modified version of delta debugging algorithm to support
|
|
trying multiple sets of flags concurrently. This means that the number of sets the algorithm will try at once is equal to the
|
|
number of concurrent jobs. There is currently no upper bound to that, so heed this warning that your machine may slow down
|
|
significantly. In this mode, we display the number of jobs it is running in parallel. Use `--parallel=off` to disable parallel
|
|
mode.
|
|
|
|
Be aware that this introduces some level of *non-determinism*, and it is fundamental due to the interaction with flag dependencies
|
|
and the fact one job may finish faster than another job that got ran in the same cycle. However, it generally shouldn't matter
|
|
if your test is deterministic and has no implicit flag dependencies in the codebase.
|
|
|
|
The tool will try to automatically figure out which of `--pass` or `--fail` to use if you omit them or use `--auto` by applying
|
|
heuristics. For example, if the tests works using `--fflags=true` and crashes if omitting `--fflags=true`, then it knows
|
|
to use `--pass` to give you set of flags that will cause that crash. As usual, vice versa is also true. Since this is a
|
|
heuristic, if it gets that guess wrong, you can override with `--pass` or `--fail`.
|
|
|
|
You can speed this process up by scoping it to as few tests as possible, for example if you're using doctest then you'd
|
|
pass `--tc=my_test` as an argument after `--`, so `flag-bisect ./path/to/binary -- --tc=my_test`.
|
|
''')
|
|
|
|
class InterestnessMode(Enum):
    """How the user asked the tool to decide which outcome is interesting.

    The members are selected by the `--auto`, `--fail` and `--pass` switches.
    The values are plain integers; the previous trailing commas accidentally
    turned each value into a one-element tuple such as `(0,)`.
    """

    AUTO = 0
    FAIL = 1
    PASS = 2
|
|
|
|
def add_argument_parsers(parser):
    """Declare every flag-bisect command-line argument on `parser`.

    Adds the positional binary path, the general options, and a mutually
    exclusive group holding `--auto`, `--fail`, `--pass` and `--timeout`.
    """
    add = parser.add_argument

    add('binary_path', help='Path to the unit test binary that will be bisected for a set of flags')

    add(
        '--tries',
        dest='attempts',
        type=int,
        default=1,
        metavar='N',
        help='If the tests are flaky, flag-bisect will try again with the same set by N amount of times before moving on',
    )

    # 'default' is not one of the choices: it marks "the user did not say",
    # which Options resolves depending on the number of attempts.
    add(
        '--parallel',
        dest='parallel',
        choices=['on', 'off'],
        default='default',
        help='Test multiple sets of flags in parallel, useful when the test takes a while to run.',
    )

    add('--explicit', dest='explicit', action='store_true', default=False, help='Explicitly set flags to false')

    add('--filter', dest='filter', default=None, help='Regular expression to filter for a subset of flags to test')

    add('--verbose', dest='verbose', action='store_true', default=False, help='Show stdout and stderr of the program being run')

    # The three mode switches share the 'mode' destination; only --auto carries the default.
    mode_group = parser.add_mutually_exclusive_group()
    mode_group.add_argument(
        '--auto',
        dest='mode',
        action='store_const',
        const=InterestnessMode.AUTO,
        default=InterestnessMode.AUTO,
        help='Automatically figure out which one of --pass or --fail should be used',
    )
    mode_group.add_argument(
        '--fail',
        dest='mode',
        action='store_const',
        const=InterestnessMode.FAIL,
        help='You want this if omitting --fflags=true causes tests to fail',
    )
    mode_group.add_argument(
        '--pass',
        dest='mode',
        action='store_const',
        const=InterestnessMode.PASS,
        help='You want this if passing --fflags=true causes tests to pass',
    )
    mode_group.add_argument(
        '--timeout',
        dest='timeout',
        type=int,
        default=0,
        metavar='SECONDS',
        help='Find the flag(s) causing performance regression if time to run exceeds the timeout in seconds',
    )
|
|
|
|
class Options:
    """Resolved settings for one bisection run, derived from the parsed arguments."""

    def __init__(self, args, other_args, sense):
        """Capture the parsed arguments.

        args: the argparse namespace produced by the flag-bisect parser.
        other_args: arguments argparse did not recognise; they are forwarded
            to the binary under test.
        sense: stored as-is; is_hot treats a truthy value as "a passing run
            is interesting" and a falsy one as "a failing run is interesting".
        """
        self.path = args.binary_path
        self.explicit = args.explicit
        self.sense = sense
        self.verbose = args.verbose
        self.attempts = args.attempts

        # A timeout of zero means the user is not hunting for timeouts.
        self.timeout = args.timeout
        self.interested_in_timeouts = args.timeout != 0

        # Parallel mode is the default only for a single attempt; with
        # several attempts it has to be requested explicitly.
        if args.attempts == 1:
            self.parallel = args.parallel in ('on', 'default')
        else:
            self.parallel = args.parallel == 'on'

        if args.filter:
            self.filter = re.compile(".*" + args.filter + ".*")
        else:
            self.filter = None

        # Useless to have -- here, discard.
        forwarded = []
        for argument in other_args:
            if argument != '--':
                forwarded.append(argument)
        self.other_args = forwarded

    def copy_with_sense(self, sense):
        """Return a shallow copy of these options with `sense` replaced."""
        clone = copy.copy(self)
        clone.sense = sense
        return clone
|
|
|
|
class InterestnessResult(Enum):
    """Outcome of a single run of the binary under test.

    The values are plain integers; the previous trailing commas accidentally
    turned each value into a one-element tuple such as `(0,)`.
    """

    FAIL = 0
    PASS = 1
    TIMED_OUT = 2
|
|
|
|
class Progress:
    """Single-line progress indicator written to stdout and redrawn in place."""

    def __init__(self, count, n_of_jobs=None):
        """Start at step zero with `count` flags and nothing drawn yet."""
        self.count = count
        self.steps = 0
        self.n_of_jobs = n_of_jobs
        self.buffer = None  # text most recently written by update(), if any

    def show(self):
        """Return the status line for the current state without printing it."""
        # The number reported as remaining is actually the height of the current search tree.
        height = int(math.log2(self.count))

        flag_word = 'flags' if self.count != 1 else 'flag'
        node_word = 'nodes' if height != 1 else 'node'

        details = [f'step {self.steps}', f'{height} {node_word} remain']
        if self.n_of_jobs is not None:
            details.append(f'running {self.n_of_jobs} jobs')

        return f'flag bisection: testing {self.count} {flag_word} (' + ', '.join(details) + ')'

    def hide(self):
        """Erase the previously drawn status line, if there is one."""
        if not self.buffer:
            return
        # Backspace, overwrite with a space, backspace again: once per drawn character.
        eraser = '\b \b' * len(self.buffer)
        sys.stdout.write(eraser)

    def update(self, len, n_of_jobs=None):
        """Advance one step and redraw the line for `len` flags and `n_of_jobs` jobs."""
        self.hide()

        self.steps += 1
        self.count = len
        self.n_of_jobs = n_of_jobs

        line = self.show()
        self.buffer = line
        sys.stdout.write(line)
        sys.stdout.flush()
|
|
|
|
def list_fflags(options):
    """Ask the binary for its flag names by running it with `--list-fflags`.

    options: provides `path` (the binary to run) and `filter` (a compiled
        regular expression, or None to keep every flag).

    Returns the list of flag names that match the filter, or None when the
    binary cannot be run, exits with an error, does not appear to support
    `--list-fflags`, or no flag is left after filtering.
    """
    try:
        out = subprocess.check_output([options.path, '--list-fflags'], encoding='UTF-8')
    except (OSError, ValueError, subprocess.SubprocessError):
        # The binary is missing or not executable, exited non-zero, or printed
        # undecodable output. Anything else (e.g. KeyboardInterrupt) propagates.
        return None

    # It's unlikely that a program we're going to test has no flags.
    # So if the output doesn't start with a flag prefix, assume it doesn't
    # support --list-fflags and therefore cannot be bisected.
    if not out.startswith(('FFlag', 'DFFlag', 'SFFlag')):
        return None

    # splitlines() keeps the final flag even without a trailing newline, and
    # blank lines are dropped so no empty flag name is ever produced.
    flag_names = [line for line in out.splitlines() if line]

    if options.filter:
        flag_names = [flag for flag in flag_names if options.filter.match(flag) is not None]

    return flag_names or None
|
|
|
|
def mk_flags_argument(options, flags, initial_flags):
    """Build the `--fflags=...` argument that enables exactly `flags`.

    options: only `explicit` is read.
    flags: the flag names to set to true.
    initial_flags: every flag under consideration; consulted only in explicit
        mode.

    Returns a string such as `--fflags=FFlagA=true,FFlagB=false`.
    """
    assignments = [flag + '=true' for flag in flags]

    # When --explicit is provided, every flag from initial_flags that is not
    # active gets an explicit =false instead of being left at its default.
    if options.explicit:
        # A set keeps the membership test constant-time; both lists can hold
        # thousands of flags and this runs for every candidate set.
        active = set(flags)
        assignments.extend(flag + '=false' for flag in initial_flags if flag not in active)

    return '--fflags=' + ','.join(assignments)
|
|
|
|
def mk_command_line(options, flags_argument):
    """Return the argv list: binary path, forwarded arguments, then the flags argument.

    `flags_argument` is appended last and left out entirely when it is None.
    """
    command = [options.path]
    command.extend(options.other_args)
    if flags_argument is None:
        return command
    command.append(flags_argument)
    return command
|
|
|
|
async def get_interestness(options, flags_argument):
    """Run the binary once and classify the outcome.

    options: provides the command line pieces, `verbose`, `timeout` and
        `interested_in_timeouts`.
    flags_argument: the `--fflags=...` argument to append, or None for none.

    Returns InterestnessResult.PASS for exit status 0, FAIL for any other
    exit status, and TIMED_OUT when a timeout is in effect and the run
    exceeds it.

    The child is started through asyncio rather than the blocking
    `subprocess.run`, which would stall the event loop and make concurrently
    scheduled runs execute one after another.
    """
    timeout = options.timeout if options.interested_in_timeouts else None
    cmd = mk_command_line(options, flags_argument)

    # Unless --verbose was given, the program's output is discarded.
    sink = None if options.verbose else asyncio.subprocess.DEVNULL

    process = await asyncio.create_subprocess_exec(*cmd, stdout=sink, stderr=sink)
    try:
        returncode = await asyncio.wait_for(process.wait(), timeout=timeout)
    except asyncio.TimeoutError:
        return InterestnessResult.TIMED_OUT
    finally:
        # Reached with a live child on timeout or when this job is cancelled;
        # kill and reap it so no stray process keeps running.
        if process.returncode is None:
            process.kill()
            await process.wait()

    return InterestnessResult.PASS if returncode == 0 else InterestnessResult.FAIL
|
|
|
|
async def is_hot(options, flags_argument, pred=any):
    """Decide whether a flags argument produces the outcome being searched for.

    The binary is run `options.attempts` times. Each run is compared against
    the wanted outcome: a timeout when timeouts are of interest, otherwise a
    pass when `options.sense` is truthy and a failure when it is not. `pred`
    (`any` by default) folds the per-run booleans into the final answer.
    """
    runs = [get_interestness(options, flags_argument) for _ in range(options.attempts)]
    outcomes = await asyncio.gather(*runs)

    if options.interested_in_timeouts:
        wanted = InterestnessResult.TIMED_OUT
    elif options.sense:
        wanted = InterestnessResult.PASS
    else:
        wanted = InterestnessResult.FAIL

    return pred([wanted == outcome for outcome in outcomes])
|
|
|
|
def pairwise_disjoints(flags, granularity):
    """Yield consecutive, non-overlapping slices of `flags`.

    Each slice holds `len(flags) // granularity` items, except that the last
    one may be shorter; when the length is not divisible this yields more
    than `granularity` slices. Nothing is yielded for an empty list.

    The slice length is clamped to at least one item. Without the clamp a
    granularity larger than `len(flags)` gives a zero-length slice and the
    generator never advances.
    """
    per_slice_len = max(1, len(flags) // granularity)
    for offset in range(0, len(flags), per_slice_len):
        yield flags[offset:offset + per_slice_len]
|
|
|
|
def subsets_and_complements(flags, granularity):
    """Yield `(subset, complement)` pairs for every slice of `flags`.

    The subsets come from `pairwise_disjoints`; each complement is every flag
    of `flags` that does not occur in that subset, in the original order.
    """
    for chunk in pairwise_disjoints(flags, granularity):
        rest = []
        for flag in flags:
            if flag not in chunk:
                rest.append(flag)
        yield chunk, rest
|
|
|
|
# https://www.cs.purdue.edu/homes/xyzhang/fall07/Papers/delta-debugging.pdf
async def ddmin(options, initial_flags):
    """Sequentially shrink `initial_flags` with delta debugging.

    The candidate set is split into `chunks` slices. For each slice the
    complement is tried first and the slice itself second; the first hot one
    becomes the new candidate set. When nothing in a round is hot the split
    is made finer, and the search stops once the split cannot get any finer
    or fewer than two flags remain.

    Returns the list of flags left at the end of the search.
    """
    remaining = initial_flags
    chunks = 2

    progress = Progress(len(remaining))
    progress.update(len(remaining))

    async def triggers(candidate):
        return await is_hot(options, mk_flags_argument(options, candidate, initial_flags))

    while len(remaining) >= 2:
        reduced = False

        for subset, complement in subsets_and_complements(remaining, chunks):
            progress.update(len(remaining))

            if await triggers(complement):
                # Dropping this slice kept the behavior: continue with one chunk fewer.
                remaining = complement
                chunks = max(chunks - 1, 2)
                reduced = True
                break

            if await triggers(subset):
                # The slice alone reproduces the behavior: start over inside it.
                remaining = subset
                chunks = 2
                reduced = True
                break

        if reduced:
            continue

        # Nothing was hot at this granularity; give up once it is already maximal.
        if chunks == len(remaining):
            break
        chunks = min(chunks * 2, len(remaining))

    progress.hide()
    return remaining
|
|
|
|
async def ddmin_parallel(options, initial_flags):
    """Shrink `initial_flags` with delta debugging, testing candidates as concurrent tasks.

    Every round creates one task per subset and one per complement. The first
    task to report a hot candidate decides the next candidate set and cancels
    the other tasks of the round. When nothing is hot the split is made
    finer, and the search stops once the split cannot get any finer or fewer
    than two flags remain.

    Returns the list of flags left at the end of the search.
    """
    remaining = initial_flags
    chunks = 2

    progress = Progress(len(remaining))
    progress.update(len(remaining), chunks)

    async def probe(candidate, next_chunks):
        verdict = await is_hot(options, mk_flags_argument(options, candidate, initial_flags))
        return (verdict, candidate, next_chunks)

    while len(remaining) >= 2:
        reduced = False

        subset_jobs = []
        complement_jobs = []

        def on_done(task):
            nonlocal remaining, chunks, reduced

            # task.cancel() calls the callback passed to add_done_callback...
            if task.cancelled():
                return

            verdict, candidate, next_chunks = task.result()
            if not verdict or reduced:
                return

            # First hot result of this round wins; everything else is moot.
            remaining = candidate
            chunks = next_chunks
            reduced = True
            for job in subset_jobs + complement_jobs:
                job.cancel()

        for subset, complement in subsets_and_complements(remaining, chunks):
            # A hot subset restarts the search inside it at granularity 2.
            subset_job = asyncio.create_task(probe(subset, 2))
            subset_job.add_done_callback(on_done)
            subset_jobs.append(subset_job)

            # A hot complement continues with one chunk fewer. Its task is
            # created here as well; it is only awaited after the subsets.
            complement_job = asyncio.create_task(probe(complement, max(chunks - 1, 2)))
            complement_job.add_done_callback(on_done)
            complement_jobs.append(complement_job)

        # Cancelled jobs surface as exceptions; collecting them as results
        # lets control return to this routine instead of raising.
        await asyncio.gather(*subset_jobs, return_exceptions=True)
        if not reduced:
            await asyncio.gather(*complement_jobs, return_exceptions=True)
        progress.update(len(remaining), chunks)

        if not reduced:
            # Nothing was hot at this granularity; give up once it is already maximal.
            if chunks == len(remaining):
                break
            chunks = min(chunks * 2, len(remaining))

    progress.hide()
    return remaining
|
|
|
|
def search(options, initial_flags):
    """Return the coroutine for the search matching `options.parallel`.

    The caller is expected to await the returned coroutine.
    """
    strategy = ddmin_parallel if options.parallel else ddmin
    return strategy(options, initial_flags)
|
|
|
|
async def do_work(args, other_args):
    """Run a whole flag bisection and return the process exit status.

    args: the parsed argparse namespace.
    other_args: unrecognised arguments, forwarded to the binary under test.

    Returns 0 when a set of flags was found and printed, 1 otherwise (the
    tests already look healthy, the binary cannot be bisected, the command
    line would be too long on Windows, or nothing was found).
    """
    # If --timeout isn't used, try to apply a heuristic to figure out which of --pass or --fail we want.
    wants_auto_sense = args.timeout == 0 and args.mode == InterestnessMode.AUTO

    if wants_auto_sense:
        probe = Options(args, other_args, None)

        # We aren't interested in timeouts for this heuristic. It just makes no sense to assume timeouts.
        # This actually cannot happen by this point, but if we make timeout a non-exclusive switch to --auto, this will go wrong.
        probe.timeout = 0
        probe.interested_in_timeouts = False

        all_tasks = asyncio.gather(
            is_hot(probe.copy_with_sense(True), '--fflags=true', all),
            is_hot(probe.copy_with_sense(False), '--fflags=false' if probe.explicit else None, all),
        )

        # If this wait times out, print a message saying that we are still working. We intentionally keep going.
        finished, _ = await asyncio.wait([all_tasks], timeout=1.5)
        if all_tasks not in finished:
            print('Hang on! I\'m running your program to try and figure out which of --pass or --fail to use!')
            print('Need to find out faster? Cancel the work and explicitly write --pass or --fail')

        is_pass_hot, is_fail_hot = await all_tasks

        # This is a bit counter-intuitive, but the following table tells us which sense we want.
        # When you omit the --fflags=true argument and it fails, then is_fail_hot is True.
        # Consequently, you need to use --pass to find out what that set of flags is. And vice versa.
        #
        # Also, when is_pass_hot is True and is_fail_hot is False, the program is working as expected.
        # There should be no reason to run flag bisection.
        # However, the opposite of that outcome is ambiguous!
        #
        # is_pass_hot | is_fail_hot | is ambiguous?
        #-------------|-------------|---------------
        # True        | True        | No! Pick --pass.
        # False       | False       | No! Pick --fail.
        # True        | False       | No! But this is the exact situation where you shouldn't need to flag-bisect. Raise an error.
        # False       | True        | Yes! But we'll pragmatically pick --fail here in the hope it gives the correct set of flags.

        if is_pass_hot and not is_fail_hot:
            print('The tests seems to be working fine for me. If you really need to flag-bisect, please try again with an explicit --pass or --fail', file=sys.stderr)
            return 1

        if is_fail_hot and not is_pass_hot:
            print('I couldn\'t quite figure out which of --pass or --fail to use, but I\'ll carry on anyway')

        sense = is_pass_hot
        argument = '--pass' if sense else '--fail'
        print(f'I\'m bisecting flags as if {argument} was used')
    else:
        sense = args.mode == InterestnessMode.PASS

    options = Options(args, other_args, sense)

    initial_flags = list_fflags(options)
    if initial_flags is None:
        complaints = (
            'I cannot bisect flags with ' + options.path,
            'These are required for me to be able to cooperate:',
            '\t--list-fflags must print a list of flags separated by newlines, including FFlag prefix',
            '\t--fflags=... to accept a comma-separated pair of flag names and their value in the form FFlagFoo=true',
        )
        for complaint in complaints:
            print(complaint, file=sys.stderr)
        return 1

    # On Windows, there is an upper bound on the number of characters in a command line.
    # If we don't handle this ourselves, the runtime error is going to look nothing like the actual problem.
    # It'd say "file name way too long" or something to that effect. We can tee up a better error message
    # and tell the user how to work around it by using --filter.
    if platform.system() == 'Windows':
        longest = mk_command_line(options, mk_flags_argument(options, initial_flags, []))
        if len(' '.join(longest)) >= 8191:
            print(f'Never mind! The command line is too long because we have {len(initial_flags)} flags to test', file=sys.stderr)
            print('Consider using `--filter=<regex>` to narrow it down upfront, or use any version of WSL instead', file=sys.stderr)
            return 1

    hot_flags = await search(options, initial_flags)
    if not hot_flags:
        print('I found nothing, sorry', file=sys.stderr)
        return 1

    print('I narrowed down to these flags:')
    print(textwrap.indent('\n'.join(hot_flags), prefix='\t'))

    # If we showed the command line in explicit mode, all flags would be listed here.
    # This would pollute the terminal with 3000 flags. We don't want that. Don't show it.
    # Ditto for when the number of flags we ended up with equals the number we started with.
    if not options.explicit and len(hot_flags) != len(initial_flags):
        print('$ ' + ' '.join(mk_command_line(options, mk_flags_argument(options, hot_flags, initial_flags))))

    return 0
|
|
|
|
def flag_bisect_main(args, other_args):
    """Run `do_work` to completion on a fresh event loop and return its exit status."""
    exit_status = asyncio.run(do_work(args, other_args))
    return exit_status
|
|
|
|
def main():
    """Parse the command line as a standalone tool and return the exit status.

    Arguments the parser does not recognise are forwarded to the bisection
    rather than rejected.
    """
    parser = argparse.ArgumentParser(
        description=help(),
        epilog=epilog(),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    add_argument_parsers(parser)
    return flag_bisect_main(*parser.parse_known_args())
|
|
|
|
# Script entry point: the value returned by main() becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
|