mirror of
https://github.com/luau-lang/luau.git
synced 2025-01-19 17:28:06 +00:00
2088 lines
70 KiB
Lua
2088 lines
70 KiB
Lua
--[[
|
|
PCRE2-based RegEx implementation for Luau
|
|
Version 1.0.0a2
|
|
BSD 2-Clause Licence
|
|
Copyright © 2020 - Blockzez (devforum /u/Blockzez and github.com/Blockzez)
|
|
All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are met:
|
|
|
|
1. Redistributions of source code must retain the above copyright notice, this
|
|
list of conditions and the following disclaimer.
|
|
|
|
2. Redistributions in binary form must reproduce the above copyright notice,
|
|
this list of conditions and the following disclaimer in the documentation
|
|
and/or other materials provided with the distribution.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
]]
|
|
--[[ Settings ]]--
|
|
-- You can change them here
|
|
local options = {
	-- Maximum number of compiled patterns kept in the cache, so that a pattern
	-- that was seen before does not have to be recompiled.
	-- Accepted values: a number >= 0, a string that can be automatically
	-- coerced to a number >= 0, false, or nil.
	-- Empty regex patterns (comment-only patterns included) are never cached,
	-- regardless of this setting.
	-- The default is 256.
	cacheSize = 256,

	-- Whether the Unicode data modules are used.
	-- If this value evaluates to false, _unicodechar_category, _scripts and
	-- _xuc can be removed safely, and an error is raised when you:
	-- - try to compile a RegEx with the unicode flag
	-- - try to use the \p pattern
	-- The documented default is true; in this copy the value is set to false.
	unicodeData = false,
};
|
|
|
|
--
|
|
-- Unicode data modules. Each of these is `false` when options.unicodeData is
-- falsy, because the `and` short-circuits before `require` is evaluated.
local u_categories = options.unicodeData and require(script:WaitForChild("_unicodechar_category"));
local chr_scripts = options.unicodeData and require(script:WaitForChild("_scripts"));
local xuc_chr = options.unicodeData and require(script:WaitForChild("_xuc"));

-- Weak-keyed map from a public proxy object to its private data table.
local proxy = setmetatable({ }, { __mode = 'k' });

-- Module table, RegEx method table and Match method table.
local re = { };
local re_m = { };
local match_m = { };

-- Value stored in the __metatable field of the proxy objects created below.
local lockmsg;
|
|
|
|
--[[ Functions ]]--
|
|
-- Decodes a UTF-8 string into an array of code points.
-- @param self  the string to decode
-- @param init  optional 1-based character index; when given, decoding starts
--              at that character and everything before it is dropped
-- @return a table holding the code points at 1..n, the number of characters
--         in field `n` and the (possibly sliced) string in field `s`
local function to_str_arr(self, init)
	-- Code points are decoded in chunks of this many characters; each chunk
	-- is the result list of a single utf8.codepoint call.
	local CHUNK = 1999;

	if init then
		self = string.sub(self, utf8.offset(self, init));
	end;

	local len = utf8.len(self);
	if len <= CHUNK then
		return { n = len, s = self, utf8.codepoint(self, 1, #self) };
	end;

	local ret = table.create(len);
	local write_at = 1;
	for first_char = 1, len, CHUNK do
		-- character index just past this chunk (len + 1 for the last chunk)
		local next_char = math.min(first_char + CHUNK, len + 1);
		local c = table.pack(utf8.codepoint(self, utf8.offset(self, first_char), utf8.offset(self, next_char) - 1));
		table.move(c, 1, c.n, write_at, ret);
		write_at = write_at + c.n;
	end;
	ret.s, ret.n = self, len;
	return ret;
end;
|
|
|
|
-- Encodes an array of code points back into a UTF-8 string.
-- @param self  array of code points; its length is taken from field `n` when
--              present, otherwise from the length operator
-- @return the encoded string
local function from_str_arr(self)
	-- Long arrays are encoded in slices of this many code points; each slice
	-- is the argument list of a single utf8.char call.
	local CHUNK = 7997;

	local len = self.n or #self;
	if len <= CHUNK then
		return utf8.char(table.unpack(self));
	end;

	local pieces = table.create(math.ceil(len / CHUNK));
	local piece_i = 0;
	for first = 1, len, CHUNK do
		piece_i = piece_i + 1;
		pieces[piece_i] = utf8.char(table.unpack(self, first, math.min(first + CHUNK - 1, len)));
	end;
	return table.concat(pieces);
end;
|
|
|
|
-- Character-indexed substring of a UTF-8 string.
-- @param self  the string
-- @param i     1-based index of the first character to include
-- @param j     1-based index of the first character to exclude; when
--              utf8.offset finds no such character the result runs to the
--              end of the string
-- @return the characters in the half-open range [i, j)
local function utf8_sub(self, i, j)
	local stop_byte = utf8.offset(self, j);
	if stop_byte then
		-- byte just before the first byte of character j
		stop_byte = stop_byte - 1;
	end;
	local first_byte = utf8.offset(self, i);
	return string.sub(self, first_byte, stop_byte);
end;
|
|
|
|
--
|
|
-- Single-letter flag characters mapped to their long flag names.
local flag_map = {
	a = 'anchored',
	i = 'caseless',
	m = 'multiline',
	s = 'dotall',
	u = 'unicode',
	U = 'ungreedy',
	x = 'extended',
};
|
|
|
|
-- Set of the recognised POSIX character class names (name -> true).
local posix_class_names = { };
for _, class_name in ipairs({
	"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
	"lower", "print", "punct", "space", "upper", "word", "xdigit",
}) do
	posix_class_names[class_name] = true;
end;
|
|
|
|
-- Escape table keyed by the code point of the character after the backslash.
-- A table value is a token ({ "class", name, negated } or a one-element
-- marker table); a number value is the literal code point the escape
-- stands for.
local escape_chars = {
	-- grouped
	-- digit, spaces and words (upper-case letter = negated class)
	[0x44] = { "class", "digit", true },         -- D
	[0x53] = { "class", "space", true },         -- S
	[0x57] = { "class", "word", true },          -- W
	[0x64] = { "class", "digit", false },        -- d
	[0x73] = { "class", "space", false },        -- s
	[0x77] = { "class", "word", false },         -- w

	-- horizontal/vertical whitespace and newline
	[0x48] = { "class", "blank", true },         -- H
	[0x56] = { "class", "vertical_tab", true },  -- V
	[0x68] = { "class", "blank", false },        -- h
	[0x76] = { "class", "vertical_tab", false }, -- v
	[0x4E] = { 0x4E },                           -- N
	[0x52] = { 0x52 },                           -- R

	-- not grouped
	-- NOTE(review): 0x42 is 'B'; the PCRE backspace escape is \b (0x62).
	-- Confirm against the parser that consumes this table before changing it.
	[0x42] = 0x08,
	[0x6E] = 0x0A, -- n -> line feed
	[0x72] = 0x0D, -- r -> carriage return
	[0x74] = 0x09, -- t -> horizontal tab
};
|
|
|
|
-- Escapes that produce marker tokens, keyed by the code point of the
-- character after the backslash. The first element of each token is that
-- same code point.
local b_escape_chars = {
	-- word boundary and not word boundary; the second element is the
	-- character class used to decide what counts as a word character
	[0x62] = { 0x62, { "class", "word", false } }, -- b
	[0x42] = { 0x42, { "class", "word", false } }, -- B

	-- keep match out
	[0x4B] = { 0x4B }, -- K

	-- start & end of string
	-- NOTE(review): 0x4A is 'J', while the matcher tests for 0x41 ('A')
	-- alongside 0x5E and 0x47; confirm whether 0x41 was intended here.
	[0x47] = { 0x47 }, -- G
	[0x4A] = { 0x4A }, -- J
	[0x5A] = { 0x5A }, -- Z
	[0x7A] = { 0x7A }, -- z
};
|
|
|
|
-- Set of accepted category names (name -> true): the one- and two-letter
-- general categories plus the three-letter X* names.
local valid_categories = { };
for category in string.gmatch(
	"C Cc Cf Cn Co Cs "
	.. "L Ll Lm Lo Lt Lu "
	.. "M Mc Me Mn "
	.. "N Nd Nl No "
	.. "P Pc Pd Pe Pf Pi Po Ps "
	.. "S Sc Sk Sm So "
	.. "Z Zl Zp Zs "
	.. "Xan Xps Xsp Xuc Xwd",
	"%S+"
) do
	valid_categories[category] = true;
end;
|
|
|
|
-- Set of ASCII punctuation code points (code point -> true), built from the
-- four contiguous ranges that make up the set.
local class_ascii_punct = { };
for _, range in ipairs({
	{ 0x21, 0x2F }, -- ! " # $ % & ' ( ) * + , - . /
	{ 0x3A, 0x40 }, -- : ; < = > ? @
	{ 0x5B, 0x60 }, -- [ \ ] ^ _ `
	{ 0x7B, 0x7E }, -- { | } ~
}) do
	for code = range[1], range[2] do
		class_ascii_punct[code] = true;
	end;
end;
|
|
|
|
-- Shared marker tokens. Each is a one-element table holding the code point
-- of the metacharacter it stands for; `alternation` is compared by identity
-- when a token list is scanned for the next alternative.
local end_str = { 0x24 };       -- $
local dot = { 0x2E };           -- .
local beginning_str = { 0x5E }; -- ^
local alternation = { 0x7C };   -- |
|
|
|
|
-- Wraps a method implementation with argument checking and proxy unwrapping.
-- @param re_type  "Match" to build a Match method wrapper; any other value
--                 builds a RegEx method wrapper
-- @param name     method name, used in error messages and to select the
--                 method-specific argument handling
-- @param func     the implementation; it receives the private data table in
--                 place of the public proxy object
-- @return the wrapped function
local function check_re(re_type, name, func)
	if re_type == "Match" then
		return function(...)
			local arg_n = select('#', ...);
			if arg_n < 1 then
				error("missing argument #1 (Match expected)", 2);
			end;
			local arg0, arg1 = ...;
			if not (proxy[arg0] and proxy[arg0].name == "Match") then
				error(string.format("invalid argument #1 to %q (Match expected, got %s)", name, typeof(arg0)), 2);
			end;
			arg0 = proxy[arg0];
			-- group/span default to group 0 (the whole match)
			if (name == "group" or name == "span") and arg1 == nil then
				arg1 = 0;
			end;
			return func(arg0, arg1);
		end;
	end;
	return function(...)
		local arg_n = select('#', ...);
		if arg_n < 1 then
			error("missing argument #1 (RegEx expected)", 2);
		elseif arg_n < 2 then
			error("missing argument #2 (string expected)", 2);
		end;
		local arg0, arg1, arg2, arg3, arg4, arg5 = ...;
		-- The first argument exactly as the caller passed it; handed to the
		-- implementation as `source` for the methods that build Match objects.
		local source = arg0;

		if not (proxy[arg0] and proxy[arg0].name == "RegEx") then
			-- A plain pattern (string or number) is compiled on the fly.
			if type(arg0) ~= "string" and type(arg0) ~= "number" then
				error(string.format("invalid argument #1 to %q (RegEx expected, got %s)", name, typeof(arg0)), 2);
			end;
			arg0 = re.fromstring(arg0);
		end;

		-- Validate the subject string. This used to be an `elseif` of the
		-- test above, so it was skipped whenever the pattern was given as a
		-- string; it now runs for every call.
		if name == "sub" then
			-- for sub the subject is argument #3 (argument #2 is the replacement)
			if type(arg2) == "number" then
				arg2 = arg2 .. '';
			elseif type(arg2) ~= "string" then
				error(string.format("invalid argument #3 to 'sub' (string expected, got %s)", typeof(arg2)), 2);
			end;
		elseif type(arg1) == "number" then
			arg1 = arg1 .. '';
		elseif type(arg1) ~= "string" then
			error(string.format("invalid argument #2 to %q (string expected, got %s)", name, typeof(arg1)), 2);
		end;

		-- Every method except sub and split takes an optional `init` as
		-- argument #3: rounded to the nearest integer, clamped to >= 1 when
		-- non-negative, counted back from #arg1 when negative.
		if name ~= "sub" and name ~= "split" then
			local init_type = typeof(arg2);
			if init_type ~= 'nil' then
				arg2 = tonumber(arg2);
				if not arg2 then
					error(string.format("invalid argument #3 to %q (number expected, got %s)", name, init_type), 2);
				elseif arg2 < 0 then
					arg2 = #arg1 + math.floor(arg2 + 0.5) + 1;
				else
					arg2 = math.max(math.floor(arg2 + 0.5), 1);
				end;
			end;
		end;

		arg0 = proxy[arg0];
		-- Inject `source` at the position where each implementation declares
		-- it. "matchall" is included because that is the name the iterator
		-- method is registered under; "matchiter" is kept for compatibility.
		if name == "match" or name == "matchall" or name == "matchiter" then
			arg3 = source;
		elseif name == "sub" then
			arg5 = source;
		end;
		return func(arg0, arg1, arg2, arg3, arg4, arg5);
	end;
end;
|
|
|
|
--[[ Matches ]]--
|
|
-- __tostring handler for Match proxy objects.
-- Reports the span of the whole match (group 0) with an inclusive end index,
-- followed by the matched text unless the match is empty.
local function match_tostr(self)
	local spans = proxy[self].spans;
	local whole = spans[0];
	local first, stop = whole[1], whole[2];
	if first < stop then
		return string.format("Match (%d..%d): %s", first, stop - 1, utf8_sub(spans.input, first, stop));
	end;
	return string.format("Match (%d..%d, empty)", first, stop - 1);
end;
|
|
|
|
-- Creates the public Match object for a successful match.
-- @param span_arr  span table returned by the matcher; it is annotated with
--                  `source` and `input` and given match_m as its metatable
-- @param group_id  table mapping group names to span indices
-- @param re        value stored as the match's `source`
-- @param str       string stored as the match's `input`
-- @return a userdata proxy whose private data is registered in `proxy`
local function new_match(span_arr, group_id, re, str)
	span_arr.source = re;
	span_arr.input = str;

	local object = newproxy(true);
	local mt = getmetatable(object);
	mt.__metatable = lockmsg;
	mt.__tostring = match_tostr;
	-- field lookups on the proxy are forwarded to the span table
	mt.__index = setmetatable(span_arr, match_m);

	proxy[object] = { name = "Match", spans = span_arr, group_id = group_id };
	return object;
end;
|
|
|
|
-- Match:group(group_id): text of one group, or nil when that group has no
-- span. A number selects the group by index; any other value is looked up
-- in the name -> index table.
match_m.group = check_re('Match', 'group', function(self, group_id)
	local index = group_id;
	if type(index) ~= "number" then
		index = self.group_id[group_id];
	end;
	local span = self.spans[index];
	if span then
		return utf8_sub(self.spans.input, span[1], span[2]);
	end;
	return nil;
end);
|
|
|
|
-- Match:span(group_id): start index and inclusive end index of one group,
-- or nil when that group has no span. A number selects the group by index;
-- any other value is looked up in the name -> index table.
match_m.span = check_re('Match', 'span', function(self, group_id)
	local index = group_id;
	if type(index) ~= "number" then
		index = self.group_id[group_id];
	end;
	local span = self.spans[index];
	if span then
		-- spans store an exclusive end; report it inclusively
		return span[1], span[2] - 1;
	end;
	return nil;
end);
|
|
|
|
-- Match:groups(): the text of groups 1..n as multiple return values (nil for
-- a group without a span). When the span table reports no groups, the text
-- of the whole match is returned instead.
match_m.groups = check_re('Match', 'groups', function(self)
	local spans = self.spans;
	local input, group_count = spans.input, spans.n;
	if group_count <= 0 then
		return utf8_sub(input, spans[0][1], spans[0][2]);
	end;

	local texts = table.create(group_count);
	for index = 0, group_count do
		local span = spans[index];
		if span then
			texts[index] = utf8_sub(input, span[1], span[2]);
		end;
	end;
	-- index 0 (the whole match) is deliberately left out of the results
	return table.unpack(texts, 1, group_count);
end);
|
|
|
|
-- Match:groupdict(): a new table mapping each group name to its text.
-- Names whose group has no span are left out.
match_m.groupdict = check_re('Match', 'groupdict', function(self)
	local spans = self.spans;
	local input = spans.input;
	local by_name = { };
	for group_name, index in pairs(self.group_id) do
		local span = spans[index];
		if span then
			by_name[group_name] = utf8_sub(input, span[1], span[2]);
		end;
	end;
	return by_name;
end);
|
|
|
|
-- Match:grouparr(): a new table holding the text of groups 0..n at indices
-- 0..n (nil for a group without a span) and the group count in field `n`.
-- The wrapper is registered as 'grouparr' so that argument errors name this
-- method; it was previously registered under the name 'groupdict'.
match_m.grouparr = check_re('Match', 'grouparr', function(self)
	local spans = self.spans;
	local ret = table.create(spans.n);
	for i = 0, spans.n do
		local v = spans[i];
		if v then
			ret[i] = utf8_sub(spans.input, v[1], v[2]);
		end;
	end;
	ret.n = spans.n;
	return ret;
end);
|
|
|
|
--
|
|
-- Newline convention names mapped to the numeric codes that is_newline
-- reads from verb_flags.newline.
local line_verbs = {
	CR = 0,
	LF = 1,
	CRLF = 2,
	ANYRLF = 3,
	ANY = 4,
	NUL = 5,
};
|
|
-- Tells whether the code point at index i ends a line under the newline
-- convention selected by verb_flags.newline (see line_verbs).
-- @param str_arr     array of code points
-- @param i           index of the code point to test
-- @param verb_flags  table whose `newline` field holds the convention code
-- @return boolean
local function is_newline(str_arr, i, verb_flags)
	local line_verb_n = verb_flags.newline;
	local chr = str_arr[i];
	if line_verb_n == 0 then
		-- CR: carriage return
		return chr == 0x0D;
	elseif line_verb_n == 2 then
		-- CRLF: line feed directly preceded by a carriage return.
		-- The preceding character was previously compared with 0x20 (space),
		-- so a real CR LF pair was never recognised.
		return chr == 0x0A and str_arr[i - 1] == 0x0D;
	elseif line_verb_n == 3 then
		-- ANYRLF: either a carriage return or a line feed
		return chr == 0x0A or chr == 0x0D;
	elseif line_verb_n == 4 then
		-- ANY: any of the Unicode newlines
		return chr == 0x0A or chr == 0x0B or chr == 0x0C or chr == 0x0D or chr == 0x85 or chr == 0x2028 or chr == 0x2029;
	elseif line_verb_n == 5 then
		-- NUL: null character
		return chr == 0;
	end;
	-- LF (code 1) and any unrecognised code: line feed
	return chr == 0x0A;
end;
|
|
|
|
|
|
-- Tests a single code point against a single-character token.
-- @param tkn_part    the token: a number (literal code point) or a table
--                    whose first element names its kind ("charset", "range",
--                    "class", "category", or the marker code points 0x2E,
--                    0x4E and 0x52)
-- @param str_arr     array of code points
-- @param i           index of the code point to test
-- @param flags       flag table; ignoreCase, unicode and dotAll are read here
-- @param verb_flags  table with the `newline` and `newline_seq` settings
-- @return a truthy value when the code point matches, a falsy one otherwise;
--         always false when there is no code point at index i
local function tkn_char_match(tkn_part, str_arr, i, flags, verb_flags)
	local chr = str_arr[i];
	if not chr then
		return false;
	elseif flags.ignoreCase and chr >= 0x61 and chr <= 0x7A then
		-- case-insensitive: fold ASCII lower case to upper case
		chr = chr - 0x20;
	end;
	if type(tkn_part) == "number" then
		return tkn_part == chr;
	elseif tkn_part[1] == "charset" then
		-- { "charset", negated, members }: matches when any member matches
		for _, v in ipairs(tkn_part[3]) do
			if tkn_char_match(v, str_arr, i, flags, verb_flags) then
				return not tkn_part[2];
			end;
		end;
		return tkn_part[2];
	elseif tkn_part[1] == "range" then
		-- { "range", low, high }; when ignoring case, an upper-case letter is
		-- also tried as its lower-case counterpart
		return chr >= tkn_part[2] and chr <= tkn_part[3] or flags.ignoreCase and chr >= 0x41 and chr <= 0x5A and (chr + 0x20) >= tkn_part[2] and (chr + 0x20) <= tkn_part[3];
	elseif tkn_part[1] == "class" then
		-- { "class", name, negated }
		local char_class = tkn_part[2];
		local negate = tkn_part[3];
		local match = false;
		if char_class == "xdigit" then
			match = chr >= 0x30 and chr <= 0x39 or chr >= 0x41 and chr <= 0x46 or chr >= 0x61 and chr <= 0x66;
		elseif char_class == "ascii" then
			match = chr <= 0x7F;
		-- cannot be accessed through POSIX classes
		elseif char_class == "vertical_tab" then
			-- U+0085 is included so that this set agrees with the Unicode
			-- newline sets used for the ANY convention and for the 0x52 token.
			match = chr >= 0x0A and chr <= 0x0D or chr == 0x85 or chr == 0x2028 or chr == 0x2029;
		--
		elseif flags.unicode then
			-- Unicode mode: classify by general category ('Cn' when unknown)
			local current_category = u_categories[chr] or 'Cn';
			local first_category = current_category:sub(1, 1);
			if char_class == "alnum" then
				match = first_category == 'L' or current_category == 'Nl' or current_category == 'Nd';
			elseif char_class == "alpha" then
				match = first_category == 'L' or current_category == 'Nl';
			elseif char_class == "blank" then
				match = current_category == 'Zs' or chr == 0x09;
			elseif char_class == "cntrl" then
				match = current_category == 'Cc';
			elseif char_class == "digit" then
				match = current_category == 'Nd';
			elseif char_class == "graph" then
				-- Visible characters: everything except separators (Z) and
				-- the C categories. Punctuation (P) used to be excluded here
				-- instead of Z, which disagreed with the ASCII definition
				-- below (0x21..0x7E) and with "print".
				match = first_category ~= 'Z' and first_category ~= 'C';
			elseif char_class == "lower" then
				match = current_category == 'Ll';
			elseif char_class == "print" then
				match = first_category ~= 'C';
			elseif char_class == "punct" then
				match = first_category == 'P';
			elseif char_class == "space" then
				match = first_category == 'Z' or chr >= 0x09 and chr <= 0x0D;
			elseif char_class == "upper" then
				match = current_category == 'Lu';
			elseif char_class == "word" then
				match = first_category == 'L' or current_category == 'Nl' or current_category == 'Nd' or current_category == 'Pc';
			end;
		elseif char_class == "alnum" then
			match = chr >= 0x30 and chr <= 0x39 or chr >= 0x41 and chr <= 0x5A or chr >= 0x61 and chr <= 0x7A;
		elseif char_class == "alpha" then
			match = chr >= 0x41 and chr <= 0x5A or chr >= 0x61 and chr <= 0x7A;
		elseif char_class == "blank" then
			match = chr == 0x09 or chr == 0x20;
		elseif char_class == "cntrl" then
			match = chr <= 0x1F or chr == 0x7F;
		elseif char_class == "digit" then
			match = chr >= 0x30 and chr <= 0x39;
		elseif char_class == "graph" then
			match = chr >= 0x21 and chr <= 0x7E;
		elseif char_class == "lower" then
			match = chr >= 0x61 and chr <= 0x7A;
		elseif char_class == "print" then
			match = chr >= 0x20 and chr <= 0x7E;
		elseif char_class == "punct" then
			match = class_ascii_punct[chr];
		elseif char_class == "space" then
			match = chr >= 0x09 and chr <= 0x0D or chr == 0x20;
		elseif char_class == "upper" then
			match = chr >= 0x41 and chr <= 0x5A;
		elseif char_class == "word" then
			match = chr >= 0x30 and chr <= 0x39 or chr >= 0x41 and chr <= 0x5A or chr >= 0x61 and chr <= 0x7A or chr == 0x5F;
		end;
		if negate then
			return not match;
		end;
		return match;
	elseif tkn_part[1] == "category" then
		-- { "category", negated, name }
		local chr_category = u_categories[chr] or 'Cn';
		local category_v = tkn_part[3];
		local category_len = #category_v;
		if category_len == 3 then
			-- three-letter names are the special X* properties
			local match = false;
			if category_v == "Xan" or category_v == "Xwd" then
				match = chr_category:find("^[LN]") or category_v == "Xwd" and chr == 0x5F;
			elseif category_v == "Xps" or category_v == "Xsp" then
				match = chr_category:sub(1, 1) == 'Z' or chr >= 0x09 and chr <= 0x0D;
			elseif category_v == "Xuc" then
				match = tkn_char_match(xuc_chr, str_arr, i, flags, verb_flags);
			end;
			if tkn_part[2] then
				return not match;
			end;
			return match;
		elseif chr_category:sub(1, category_len) == category_v then
			-- a one-letter name matches every category starting with it
			return not tkn_part[2];
		end;
		return tkn_part[2];
	elseif tkn_part[1] == 0x2E then
		-- '.': anything, or anything but a newline unless dotAll is set
		return flags.dotAll or not is_newline(str_arr, i, verb_flags);
	elseif tkn_part[1] == 0x4E then
		-- 'N' marker: anything but a newline, regardless of dotAll
		return not is_newline(str_arr, i, verb_flags);
	elseif tkn_part[1] == 0x52 then
		-- 'R' marker: a newline character
		if verb_flags.newline_seq == 0 then
			-- CR, LF or CRLF
			return chr == 0x0A or chr == 0x0D;
		end;
		-- any unicode newline
		return chr == 0x0A or chr == 0x0B or chr == 0x0C or chr == 0x0D or chr == 0x85 or chr == 0x2028 or chr == 0x2029;
	end;
	return false;
end;
|
|
|
|
-- Scans a token list forward from index i for the next alternation marker
-- at the current nesting level, stepping over whole groups.
-- @param token  the token list
-- @param i      index to start scanning at
-- @param count  optional running total; when given, it is increased for each
--               token stepped over (a group adds its `count` field, a
--               quantifier adds its third element, anything else adds 1)
-- @return the index of the alternation marker (nil when the end of the list
--         or a 0x29 token is reached first), followed by the total
local function find_alternation(token, i, count)
	while true do
		local v = token[i];
		if v == alternation then
			return i, count;
		end;

		local head = type(v) == "table" and v[1];

		-- the group opened at this position, either directly or as the
		-- operand of a quantifier
		local group;
		if head == 0x28 then
			group = v;
		elseif head == "quantifier" and type(v[5]) == "table" and v[5][1] == 0x28 then
			group = v[5];
		end;

		if group then
			if count then
				count += group.count;
			end;
			-- jump to the index stored in the group's third element; the
			-- increment below then moves past it
			i = group[3];
		elseif not v or head == 0x29 then
			return nil, count;
		elseif count then
			if head == "quantifier" then
				count += v[3];
			else
				count += 1;
			end;
		end;
		i += 1;
	end;
end;
|
|
|
|
-- Backtracking matcher: searches str_arr for the first match of a token list.
--
-- `states` is a stack with the newest entry at index 1. Entries are:
--   { "alternation", index of the next alternative, str_i }
--   { "group", opening token index, start str_i, end str_i, span index,
--     closing token index, kind, ... } -- quantified groups also carry the
--     min [8], max [9], iteration count [10], next alternative [11], mode
--     [12] and a flag [13] used for lazy groups; recursion entries carry `jmp`
--   { "quantifier", token index, str_i, limit, step }
--   { "matchStart", str_i }
--   { "PRUNE" | "SKIP", str_i }
--
-- @param token       token list; `group_n` holds the number of groups
-- @param str_arr     code point array with `n` (length) and `s` (string)
-- @param init        index at which the search starts
-- @param flags       flag table (multiline is read here; others are read by
--                    the single-character matcher)
-- @param verb_flags  newline settings, passed through to the helpers
-- @param as_bool     when truthy only report whether a match exists
-- @return with as_bool: true or false. Otherwise nil when nothing matches, or
--         a span table: [0] = { start, end } of the whole match, [k] =
--         { start, end } for each group that took part, and `n` = group
--         count. End indices are exclusive.
local function re_rawfind(token, str_arr, init, flags, verb_flags, as_bool)
	local tkn_i, str_i, start_i = 0, init, init;
	local states = { };
	while tkn_i do
		if tkn_i == 0 then
			-- (re)start an attempt at start_i
			tkn_i += 1;
			local next_alt = find_alternation(token, tkn_i);
			if next_alt then
				table.insert(states, 1, { "alternation", next_alt, str_i });
			end;
			continue;
		end;
		local ctkn = token[tkn_i];
		local tkn_type = type(ctkn) == "table" and ctkn[1];
		if not ctkn then
			-- ran off the end of the token list: the attempt succeeded
			break;
		elseif ctkn == "ACCEPT" then
			-- Succeed at once, unless a lookaround encloses this position;
			-- in that case continue from the lookaround's closing token.
			local not_lookaround = true;
			local close_i = tkn_i;
			repeat
				close_i += 1;
				local is_table = type(token[close_i]) == "table";
				local close_i_tkn = token[close_i];
				if is_table and (close_i_tkn[1] == 0x28 or close_i_tkn[1] == "quantifier" and type(close_i_tkn[5]) == "table" and close_i_tkn[5][1] == 0x28) then
					close_i = close_i_tkn[1] == "quantifier" and close_i_tkn[5][3] or close_i_tkn[3];
				elseif is_table and close_i_tkn[1] == 0x29 and (close_i_tkn[4] == 0x21 or close_i_tkn[4] == 0x3D) then
					not_lookaround = false;
					tkn_i = close_i;
					break;
				end;
			until not close_i_tkn;
			if not_lookaround then
				break;
			end;
		elseif ctkn == "PRUNE" or ctkn == "SKIP" then
			table.insert(states, 1, { ctkn, str_i });
			tkn_i += 1;
		elseif tkn_type == 0x28 then
			-- group opening
			table.insert(states, 1, { "group", tkn_i, str_i, nil, ctkn[2], ctkn[3], ctkn[4] });
			tkn_i += 1;
			-- a count is requested only for a lookaround (0x21 / 0x3D) whose
			-- fifth element is set; str_i is then moved back by that count
			local next_alt, count = find_alternation(token, tkn_i, (ctkn[4] == 0x21 or ctkn[4] == 0x3D) and ctkn[5] and 0);
			if next_alt then
				table.insert(states, 1, { "alternation", next_alt, str_i });
			end;
			if count then
				str_i -= count;
			end;
		elseif tkn_type == 0x29 and ctkn[4] ~= 0x21 then
			-- group closing (a closing 0x21 group is handled as a failure below)
			if ctkn[4] == 0x3D then
				-- 0x3D lookaround: discard every state pushed inside it.
				-- The newest "matchStart" state found on the way is kept and
				-- pushed back. It has to be declared outside the loop: when
				-- it was declared inside, it was reset on every iteration and
				-- never survived until the group state was reached.
				local selected_match_start;
				while true do
					local selected_state = table.remove(states, 1);
					if selected_state[1] == "group" and selected_state[2] == ctkn[3] then
						if not ctkn[5] then
							-- restore the position held when the group opened
							str_i = selected_state[3];
						end;
						if selected_match_start then
							table.insert(states, 1, selected_match_start);
						end;
						break;
					elseif selected_state[1] == "matchStart" and not selected_match_start then
						selected_match_start = selected_state;
					end;
				end;
			elseif ctkn[4] == 0x3E then
				-- 0x3E group: drop all of its backtracking states
				repeat
					local selected_state = table.remove(states, 1);
				until not selected_state or selected_state[1] == "group" and selected_state[2] == ctkn[3];
			else
				for _, v in ipairs(states) do
					if v[1] == "group" and v[2] == ctkn[3] then
						if v.jmp then
							-- recursive match
							tkn_i = v.jmp;
						end;
						v[4] = str_i;
						if v[7] == "quantifier" and v[10] + 1 < v[9] then
							-- quantified group below its maximum: set up the
							-- next iteration
							if token[ctkn[3]][4] ~= "lazy" or v[10] + 1 < v[8] then
								tkn_i = ctkn[3];
							end;
							local ctkn1 = token[ctkn[3]];
							local new_group = { "group", v[2], str_i, nil, ctkn1[5][2], ctkn1[5][3], "quantifier", ctkn1[2], ctkn1[3], v[10] + 1, v[11], ctkn1[4] };
							table.insert(states, 1, new_group);
							if v[11] then
								table.insert(states, 1, { "alternation", v[11], str_i });
							end;
						end;
						break;
					end;
				end;
			end;
			tkn_i += 1;
		elseif tkn_type == 0x4B then
			-- keep match out: remember where the reported match should start
			table.insert(states, 1, { "matchStart", str_i });
			tkn_i += 1;
		elseif tkn_type == 0x7C then
			-- Reached an alternation marker while matching: the current
			-- alternative succeeded, so skip to the end of the enclosing group
			-- (or of the token list).
			local close_i = tkn_i;
			repeat
				close_i += 1;
				local is_table = type(token[close_i]) == "table";
				local close_i_tkn = token[close_i];
				if is_table and (close_i_tkn[1] == 0x28 or close_i_tkn[1] == "quantifier" and type(close_i_tkn[5]) == "table" and close_i_tkn[5][1] == 0x28) then
					close_i = close_i_tkn[1] == "quantifier" and close_i_tkn[5][3] or close_i_tkn[3];
				end;
			until is_table and close_i_tkn[1] == 0x29 or not close_i_tkn;
			if token[close_i] then
				for _, v in ipairs(states) do
					if v[1] == "group" and v[6] == close_i then
						tkn_i = v[6];
						break;
					end;
				end;
			else
				tkn_i = close_i;
			end;
		elseif tkn_type == "recurmatch" then
			-- recursion: enter the referenced group and remember, in `jmp`,
			-- the token to return to when it closes
			table.insert(states, 1, { "group", ctkn[3], str_i, nil, nil, token[ctkn[3]][3], nil, jmp = tkn_i });
			tkn_i = ctkn[3] + 1;
			local next_alt, count = find_alternation(token, tkn_i);
			if next_alt then
				table.insert(states, 1, { "alternation", next_alt, str_i });
			end;
		else
			local match;
			if ctkn == "FAIL" then
				match = false;
			elseif tkn_type == 0x29 then
				-- closing token of a 0x21 group whose body matched: drop its
				-- states and leave `match` unset so that this counts as a failure
				repeat
					local selected_state = table.remove(states, 1);
				until selected_state[1] == "group" and selected_state[2] == ctkn[3];
			elseif tkn_type == "quantifier" then
				if type(ctkn[5]) == "table" and ctkn[5][1] == 0x28 then
					-- quantified group: push its state (below the alternation
					-- state, if there is one) and enter it
					local next_alt = find_alternation(token, tkn_i + 1);
					if next_alt then
						table.insert(states, 1, { "alternation", next_alt, str_i });
					end;
					table.insert(states, next_alt and 2 or 1, { "group", tkn_i, str_i, nil, ctkn[5][2], ctkn[5][3], "quantifier", ctkn[2], ctkn[3], 0, next_alt, ctkn[4] });
					if ctkn[4] == "lazy" and ctkn[2] == 0 then
						-- a lazy group with a minimum of 0 is skipped at first
						tkn_i = ctkn[5][3];
					end;
					match = true;
				else
					-- quantified single character or backreference
					local ref_start, ref_end;
					local pattern_count = 1;
					local is_backref = type(ctkn[5]) == "table" and ctkn[5][1] == "backref";
					if is_backref then
						pattern_count = 0;
						local group_n = ctkn[5][2];
						for _, v in ipairs(states) do
							if v[1] == "group" and v[5] == group_n then
								ref_start, ref_end = v[3], v[4];
								pattern_count = ref_end - ref_start;
								break;
							end;
						end;
					end;
					local min_max_i = str_i + ctkn[2] * pattern_count;
					local mcount = 0;
					-- consume as many repetitions as possible, up to the maximum
					while mcount < ctkn[3] do
						if is_backref then
							if ref_start and ref_end then
								local org_i = str_i;
								if utf8_sub(str_arr.s, ref_start, ref_end) ~= utf8_sub(str_arr.s, org_i, str_i + pattern_count) then
									break;
								end;
							else
								break;
							end;
						elseif not tkn_char_match(ctkn[5], str_arr, str_i, flags, verb_flags) then
							break;
						end;
						str_i += pattern_count;
						mcount += 1;
					end;
					match = mcount >= ctkn[2];
					if match and ctkn[4] ~= "possessive" then
						if ctkn[4] == "lazy" then
							-- lazy: start at the minimum and grow on backtracking
							min_max_i, str_i = str_i, min_max_i;
						end;
						table.insert(states, 1, { "quantifier", tkn_i, str_i, math.min(min_max_i, str_arr.n + 1), (ctkn[4] == "lazy" and 1 or -1) * pattern_count });
					end;
				end;
			elseif tkn_type == "backref" then
				local ref_start, ref_end;
				local group_n = ctkn[2];
				for _, v in ipairs(states) do
					if v[1] == "group" and v[5] == group_n then
						ref_start, ref_end = v[3], v[4];
						break;
					end;
				end;
				if ref_start and ref_end then
					local org_i = str_i;
					str_i += ref_end - ref_start;
					match = utf8_sub(str_arr.s, ref_start, ref_end) == utf8_sub(str_arr.s, org_i, str_i);
				end;
			else
				local chr = str_arr[str_i];
				if tkn_type == 0x24 or tkn_type == 0x5A or tkn_type == 0x7A then
					-- end-of-subject assertions
					match = str_i == str_arr.n + 1 or tkn_type == 0x24 and flags.multiline and is_newline(str_arr, str_i + 1, verb_flags) or tkn_type == 0x5A and str_i == str_arr.n and is_newline(str_arr, str_i, verb_flags);
				elseif tkn_type == 0x5E or tkn_type == 0x41 or tkn_type == 0x47 then
					-- start-of-subject assertions
					match = str_i == 1 or tkn_type == 0x5E and flags.multiline and is_newline(str_arr, str_i - 1, verb_flags) or tkn_type == 0x47 and str_i == init;
				elseif tkn_type == 0x42 or tkn_type == 0x62 then
					-- word boundary (0x62) and its negation (0x42).
					-- tkn_char_match takes the code point array and an index;
					-- this code used to pass it the code point itself, which
					-- made every word-boundary test raise an error.
					local start_m = str_i == 1 or flags.multiline and is_newline(str_arr, str_i - 1, verb_flags);
					local end_m = str_i == str_arr.n + 1 or flags.multiline and is_newline(str_arr, str_i, verb_flags);
					local prev_is_word = tkn_char_match(ctkn[2], str_arr, str_i - 1, flags, verb_flags);
					local cur_is_word = tkn_char_match(ctkn[2], str_arr, str_i, flags, verb_flags);
					local w_m = prev_is_word and 0 or cur_is_word and 1;
					if w_m == 0 then
						match = end_m or not cur_is_word;
					elseif w_m then
						match = start_m or not prev_is_word;
					end;
					if tkn_type == 0x42 then
						match = not match;
					end;
				else
					-- ordinary single-character token
					match = tkn_char_match(ctkn, str_arr, str_i, flags, verb_flags);
					str_i += 1;
				end;
			end;
			if not match then
				-- Backtrack: pop states until one offers something else to try.
				while true do
					local prev_type, prev_state = states[1] and states[1][1], states[1];
					if not prev_type or prev_type == "PRUNE" or prev_type == "SKIP" then
						-- nothing left for this start position
						if prev_type then
							table.clear(states);
						end;
						if start_i > str_arr.n then
							if as_bool then
								return false;
							end;
							return nil;
						end;
						-- SKIP resumes at the position it recorded; otherwise
						-- advance the start by one character
						start_i = prev_type == "SKIP" and prev_state[2] or start_i + 1;
						tkn_i, str_i = 0, start_i;
						break;
					elseif prev_type == "alternation" then
						tkn_i, str_i = prev_state[2], prev_state[3];
						local next_alt, count = find_alternation(token, tkn_i + 1);
						if next_alt then
							prev_state[2] = next_alt;
						else
							table.remove(states, 1);
						end;
						if count then
							str_i -= count;
						end;
						break;
					elseif prev_type == "group" then
						if prev_state[7] == "quantifier" then
							if prev_state[12] == "greedy" and prev_state[10] >= prev_state[8]
								or prev_state[12] == "lazy" and prev_state[10] < prev_state[9] and not prev_state[13] then
								tkn_i, str_i = prev_state[12] == "greedy" and prev_state[6] or prev_state[2], prev_state[3];
								if prev_state[12] == "greedy" then
									table.remove(states, 1);
									break;
								elseif prev_state[10] >= prev_state[8] then
									prev_state[13] = true;
									break;
								end;
							end;
						elseif prev_state[7] == 0x21 then
							-- the body of a 0x21 group failed, so the assertion
							-- holds: continue after its closing token
							table.remove(states, 1);
							tkn_i, str_i = prev_state[6], prev_state[3];
							break;
						end;
					elseif prev_type == "quantifier" then
						-- move one step towards the limit, if not there yet
						if math.sign(prev_state[4] - prev_state[3]) == math.sign(prev_state[5]) then
							prev_state[3] += prev_state[5];
							tkn_i, str_i = prev_state[2], prev_state[3];
							break;
						end;
					end;
					-- keep match out state and recursive state, can be safely removed
					-- prevents infinite loop
					table.remove(states, 1);
				end;
			end;
			tkn_i += 1;
		end;
	end;
	if as_bool then
		return true;
	end;
	-- Build the span table from the states that are still on the stack.
	local match_start_ran = false;
	local span = table.create(token.group_n);
	span[0], span.n = { start_i, str_i }, token.group_n;
	for _, v in ipairs(states) do
		if v[1] == "matchStart" and not match_start_ran then
			span[0][1], match_start_ran = v[2], true;
		elseif v[1] == "group" and v[5] and not span[v[5]] then
			span[v[5]] = { v[3], v[4] };
		end;
	end;
	return span;
end;
|
|
|
|
--[[ Methods ]]--
|
|
-- RegEx:test(str, init): whether the pattern matches anywhere in str,
-- searching from character index init when it is given.
re_m.test = check_re('RegEx', 'test', function(self, str, init)
	local subject = to_str_arr(str, init);
	return re_rawfind(self.token, subject, 1, self.flags, self.verb_flags, true);
end);
|
|
|
|
-- RegEx:match(str, init): the first match in str as a Match object, or nil.
-- @param str     subject string
-- @param init    optional character index to start searching from
-- @param source  value recorded as the Match's `source` (supplied by the
--                argument-checking wrapper)
-- When init is given the subject is sliced before matching, so the spans are
-- relative to the slice. The Match is therefore built with the sliced string
-- (as matchall does); with the unsliced string, group() returned text from
-- the wrong position whenever init was greater than 1.
re_m.match = check_re('RegEx', 'match', function(self, str, init, source)
	local str_arr = to_str_arr(str, init);
	local span = re_rawfind(self.token, str_arr, 1, self.flags, self.verb_flags, false);
	if not span then
		return nil;
	end;
	return new_match(span, self.group_id, source, str_arr.s);
end);
|
|
|
|
-- RegEx:matchall(str, init): an iterator function that returns the
-- successive matches in str as Match objects, and nil once there are none
-- left.
re_m.matchall = check_re('RegEx', 'matchall', function(self, str, init, source)
	local subject = to_str_arr(str, init);
	local next_start = 1;
	return function()
		if next_start > subject.n + 1 then
			return nil;
		end;
		local span = re_rawfind(self.token, subject, next_start, self.flags, self.verb_flags, false);
		if not span then
			return nil;
		end;
		local whole = span[0];
		next_start = whole[2];
		if whole[1] >= whole[2] then
			-- empty match: step one character forward so that the next
			-- search starts past this position
			next_start = next_start + 1;
		end;
		return new_match(span, self.group_id, source, subject.s);
	end;
end);
|
|
|
|
-- Expands a tokenized replacement and appends the result to repl_r.
-- @param repl_r  output array of code points (appended to in place)
-- @param str     code point array of the subject
-- @param span    span table of the current match
-- @param tkn     list of replacement parts; each part is one of
--                * a non-table value: index into `span`, replaced by that
--                  group's code points (nothing when the group has no span)
--                * { "condition", group, if_set, if_unset }: expands if_set
--                  when the group has a span (or the group's own code points
--                  when if_set is falsy), otherwise if_unset when it is given
--                * any other table: literal code points
-- @return repl_r, with field `n` set to its length
local function insert_tokenized_sub(repl_r, str, span, tkn)
	-- appends the code points covered by span[index] (exclusive end)
	local function append_group(index)
		local bounds = span[index];
		table.move(str, bounds[1], bounds[2] - 1, #repl_r + 1, repl_r);
	end;

	for _, part in ipairs(tkn) do
		if type(part) ~= "table" then
			if span[part] then
				append_group(part);
			end;
		elseif part[1] ~= "condition" then
			table.move(part, 1, #part, #repl_r + 1, repl_r);
		elseif span[part[2]] then
			if part[3] then
				insert_tokenized_sub(repl_r, str, span, part[3]);
			else
				append_group(part[2]);
			end;
		elseif part[4] then
			insert_tokenized_sub(repl_r, str, span, part[4]);
		end;
	end;
	repl_r.n = #repl_r;
	return repl_r;
end;
|
|
|
|
re_m.sub = check_re('RegEx', 'sub', function(self, repl, str, n, repl_flag_str, source)
|
|
if repl_flag_str ~= nil and type(repl_flag_str) ~= "number" and type(repl_flag_str) ~= "string" then
|
|
error(string.format("invalid argument #5 to 'sub' (string expected, got %s)", typeof(repl_flag_str)), 3);
|
|
end
|
|
local repl_flags = {
|
|
l = false, o = false, u = false,
|
|
};
|
|
for f in string.gmatch(repl_flag_str or '', utf8.charpattern) do
|
|
if repl_flags[f] ~= false then
|
|
error("invalid regular expression substitution flag " .. f, 3);
|
|
end;
|
|
repl_flags[f] = true;
|
|
end;
|
|
local repl_type = type(repl);
|
|
if repl_type == "number" then
|
|
repl ..= '';
|
|
elseif repl_type ~= "string" and repl_type ~= "function" and (not repl_flags.o or repl_type ~= "table") then
|
|
error(string.format("invalid argument #2 to 'sub' (string/function%s expected, got %s)", repl_flags.o and "/table" or '', typeof(repl)), 3);
|
|
end;
|
|
if tonumber(n) then
|
|
n = tonumber(n);
|
|
if n <= -1 or n ~= n then
|
|
n = math.huge;
|
|
end;
|
|
elseif n ~= nil then
|
|
error(string.format("invalid argument #4 to 'sub' (number expected, got %s)", typeof(n)), 3);
|
|
else
|
|
n = math.huge;
|
|
end;
|
|
if n < 1 then
|
|
return str, 0;
|
|
end;
|
|
local min_repl_n = 0;
|
|
if repl_type == "string" then
|
|
repl = to_str_arr(repl);
|
|
if not repl_flags.l then
|
|
local i1 = 0;
|
|
local repl_r = table.create(3);
|
|
local group_n = self.token.group_n;
|
|
local conditional_c = { };
|
|
while i1 < repl.n do
|
|
local i2 = i1;
|
|
repeat
|
|
i2 += 1;
|
|
until not repl[i2] or repl[i2] == 0x24 or repl[i2] == 0x5C or (repl[i2] == 0x3A or repl[i2] == 0x7D) and conditional_c[1];
|
|
min_repl_n += i2 - i1 - 1;
|
|
if i2 - i1 > 1 then
|
|
table.insert(repl_r, table.move(repl, i1 + 1, i2 - 1, 1, table.create(i2 - i1 - 1)));
|
|
end;
|
|
if repl[i2] == 0x3A then
|
|
local current_conditional_c = conditional_c[1];
|
|
if current_conditional_c[2] then
|
|
error("malformed substitution pattern", 3);
|
|
end;
|
|
current_conditional_c[2] = table.move(repl_r, current_conditional_c[3], #repl_r, 1, table.create(#repl_r + 1 - current_conditional_c[3]));
|
|
for i3 = #repl_r, current_conditional_c[3], -1 do
|
|
repl_r[i3] = nil;
|
|
end;
|
|
elseif repl[i2] == 0x7D then
|
|
local current_conditional_c = table.remove(conditional_c, 1);
|
|
local second_c = table.move(repl_r, current_conditional_c[3], #repl_r, 1, table.create(#repl_r + 1 - current_conditional_c[3]));
|
|
for i3 = #repl_r, current_conditional_c[3], -1 do
|
|
repl_r[i3] = nil;
|
|
end;
|
|
table.insert(repl_r, { "condition", current_conditional_c[1], current_conditional_c[2] ~= true and (current_conditional_c[2] or second_c), current_conditional_c[2] and second_c });
|
|
elseif repl[i2] then
|
|
i2 += 1;
|
|
local subst_c = repl[i2];
|
|
if not subst_c then
|
|
if repl[i2 - 1] == 0x5C then
|
|
error("replacement string must not end with a trailing backslash", 3);
|
|
end;
|
|
local prev_repl_f = repl_r[#repl_r];
|
|
if type(prev_repl_f) == "table" then
|
|
table.insert(prev_repl_f, repl[i2 - 1]);
|
|
else
|
|
table.insert(repl_r, { repl[i2 - 1] });
|
|
end;
|
|
elseif subst_c == 0x5C and repl[i2 - 1] == 0x24 then
|
|
local prev_repl_f = repl_r[#repl_r];
|
|
if type(prev_repl_f) == "table" then
|
|
table.insert(prev_repl_f, 0x24);
|
|
else
|
|
table.insert(repl_r, { 0x24 });
|
|
end;
|
|
i2 -= 1;
|
|
min_repl_n += 1;
|
|
elseif subst_c == 0x30 then
|
|
table.insert(repl_r, 0);
|
|
elseif subst_c > 0x30 and subst_c <= 0x39 then
|
|
local start_i2 = i2;
|
|
local group_i = subst_c - 0x30;
|
|
while repl[i2 + 1] and repl[i2 + 1] >= 0x30 and repl[i2 + 1] <= 0x39 do
|
|
group_i ..= repl[i2 + 1] - 0x30;
|
|
i2 += 1;
|
|
end;
|
|
group_i = tonumber(group_i);
|
|
if not repl_flags.u and group_i > group_n then
|
|
error("reference to non-existent subpattern", 3);
|
|
end;
|
|
table.insert(repl_r, group_i);
|
|
elseif subst_c == 0x7B and repl[i2 - 1] == 0x24 then
|
|
i2 += 1;
|
|
local start_i2 = i2;
|
|
while repl[i2] and
|
|
(repl[i2] >= 0x30 and repl[i2] <= 0x39
|
|
or repl[i2] >= 0x41 and repl[i2] <= 0x5A
|
|
or repl[i2] >= 0x61 and repl[i2] <= 0x7A
|
|
or repl[i2] == 0x5F) do
|
|
i2 += 1;
|
|
end;
|
|
if (repl[i2] == 0x7D or repl[i2] == 0x3A and (repl[i2 + 1] == 0x2B or repl[i2 + 1] == 0x2D)) and i2 ~= start_i2 then
|
|
local group_k = utf8_sub(repl.s, start_i2, i2);
|
|
if repl[start_i2] >= 0x30 and repl[start_i2] <= 0x39 then
|
|
group_k = tonumber(group_k);
|
|
if not repl_flags.u and group_k > group_n then
|
|
error("reference to non-existent subpattern", 3);
|
|
end;
|
|
else
|
|
group_k = self.group_id[group_k];
|
|
if not repl_flags.u and (not group_k or group_k > group_n) then
|
|
error("reference to non-existent subpattern", 3);
|
|
end;
|
|
end;
|
|
if repl[i2] == 0x3A then
|
|
i2 += 1;
|
|
table.insert(conditional_c, { group_k, repl[i2] == 0x2D, #repl_r + 1 });
|
|
else
|
|
table.insert(repl_r, group_k);
|
|
end;
|
|
else
|
|
error("malformed substitution pattern", 3);
|
|
end;
|
|
else
|
|
local c_escape_char;
|
|
if repl[i2 - 1] == 0x24 then
|
|
if subst_c ~= 0x24 then
|
|
local prev_repl_f = repl_r[#repl_r];
|
|
if type(prev_repl_f) == "table" then
|
|
table.insert(prev_repl_f, 0x24);
|
|
else
|
|
table.insert(repl_r, { 0x24 });
|
|
end;
|
|
end;
|
|
else
|
|
c_escape_char = escape_chars[repl[i2]];
|
|
if type(c_escape_char) ~= "number" then
|
|
c_escape_char = nil;
|
|
end;
|
|
end;
|
|
local prev_repl_f = repl_r[#repl_r];
|
|
if type(prev_repl_f) == "table" then
|
|
table.insert(prev_repl_f, c_escape_char or repl[i2]);
|
|
else
|
|
table.insert(repl_r, { c_escape_char or repl[i2] });
|
|
end;
|
|
min_repl_n += 1;
|
|
end;
|
|
end;
|
|
i1 = i2;
|
|
end;
|
|
if conditional_c[1] then
|
|
error("malformed substitution pattern", 3);
|
|
end;
|
|
if not repl_r[2] and type(repl_r[1]) == "table" and repl_r[1][1] ~= "condition" then
|
|
repl, repl.n = repl_r[1], #repl_r[1];
|
|
else
|
|
repl, repl_type = repl_r, "subst_string";
|
|
end;
|
|
end;
|
|
end;
|
|
str = to_str_arr(str);
|
|
local incr, i0, count = 0, 1, 0;
|
|
while i0 <= str.n + incr + 1 do
|
|
local span = re_rawfind(self.token, str, i0, self.flags, self.verb_flags, false);
|
|
if not span then
|
|
break;
|
|
end;
|
|
local repl_r;
|
|
if repl_type == "string" then
|
|
repl_r = repl;
|
|
elseif repl_type == "subst_string" then
|
|
repl_r = insert_tokenized_sub(table.create(min_repl_n), str, span, repl);
|
|
else
|
|
local re_match;
|
|
local repl_c;
|
|
if repl_type == "table" then
|
|
re_match = utf8_sub(str.s, span[0][1], span[0][2]);
|
|
repl_c = repl[re_match];
|
|
else
|
|
re_match = new_match(span, self.group_id, source, str.s);
|
|
repl_c = repl(re_match);
|
|
end;
|
|
if repl_c == re_match or repl_flags.o and not repl_c then
|
|
local repl_n = span[0][2] - span[0][1];
|
|
repl_r = table.move(str, span[0][1], span[0][2] - 1, 1, table.create(repl_n));
|
|
repl_r.n = repl_n;
|
|
elseif type(repl_c) == "string" then
|
|
repl_r = to_str_arr(repl_c);
|
|
elseif type(repl_c) == "number" then
|
|
repl_r = to_str_arr(repl_c .. '');
|
|
elseif repl_flags.o then
|
|
error(string.format("invalid replacement value (a %s)", type(repl_c)), 3);
|
|
else
|
|
repl_r = { n = 0 };
|
|
end;
|
|
end;
|
|
local match_len = span[0][2] - span[0][1];
|
|
local repl_len = math.min(repl_r.n, match_len);
|
|
for i1 = 0, repl_len - 1 do
|
|
str[span[0][1] + i1] = repl_r[i1 + 1];
|
|
end;
|
|
local i1 = span[0][1] + repl_len;
|
|
i0 = span[0][2];
|
|
if match_len > repl_r.n then
|
|
for i2 = 1, match_len - repl_r.n do
|
|
table.remove(str, i1);
|
|
incr -= 1;
|
|
i0 -= 1;
|
|
end;
|
|
elseif repl_r.n > match_len then
|
|
for i2 = 1, repl_r.n - match_len do
|
|
table.insert(str, i1 + i2 - 1, repl_r[repl_len + i2]);
|
|
incr += 1;
|
|
i0 += 1;
|
|
end;
|
|
end;
|
|
if match_len <= 0 then
|
|
i0 += 1;
|
|
end;
|
|
count += 1;
|
|
if n < count + 1 then
|
|
break;
|
|
end;
|
|
end;
|
|
return from_str_arr(str), count;
|
|
end);
|
|
|
|
--[[
	RegEx:split(str, n)
	Splits `str` on every match of this pattern and returns the pieces as an array.
	`n` caps the number of match attempts; nil, NaN and values <= -1 mean "no limit".
	Raises (level 3) when `n` is neither nil nor convertible to a number.
]]
re_m.split = check_re('RegEx', 'split', function(self, str, n)
	-- Normalise the limit first so the loop only ever sees a real number.
	local limit = tonumber(n);
	if limit then
		if limit <= -1 or limit ~= limit then
			limit = math.huge;
		end;
	elseif n == nil then
		limit = math.huge;
	else
		error(string.format("invalid argument #3 to 'split' (number expected, got %s)", typeof(n)), 3);
	end;

	local subject = to_str_arr(str);
	local pieces = { };
	local cursor, attempts = 1, 0;
	-- 1 when the previous match was empty; the cursor was pushed one position
	-- past it, so the next piece has to start one position earlier.
	local empty_adjust = 0;
	while cursor <= subject.n + 1 do
		attempts += 1;
		if attempts > limit then
			break;
		end;
		local span = re_rawfind(self.token, subject, cursor, self.flags, self.verb_flags, false);
		if not span then
			break;
		end;
		local whole = span[0];
		table.insert(pieces, utf8_sub(subject.s, cursor - empty_adjust, whole[1]));
		if whole[1] >= whole[2] then
			empty_adjust = 1;
		else
			empty_adjust = 0;
		end;
		cursor = whole[2] + empty_adjust;
	end;
	-- Whatever follows the last match is the final piece.
	table.insert(pieces, string.sub(subject.s, utf8.offset(subject.s, cursor - empty_adjust)));
	return pieces;
end);
|
|
|
|
--
|
|
-- Index handler for RegEx objects: methods from re_m take priority,
-- anything else is looked up in the flag table stored for the object in `proxy`.
local function re_index(self, index)
	local method = re_m[index];
	if method then
		return method;
	end;
	return proxy[self].flags[index];
end;
|
|
|
|
-- String conversion for RegEx objects: the stored pattern representation
-- followed by the stored flag representation.
local function re_tostr(self)
	local data = proxy[self];
	return data.pattern_repr .. data.flag_repr;
end;
|
|
--
|
|
|
|
-- Characters allowed directly after "(?" that need no extra parsing, keyed by code point:
--   ':' non-capturing group, '!' and '=' lookarounds, '>' atomic group, '|' branch reset
local other_valid_group_char = { };
for _, code in ipairs({ string.byte(":!=>|", 1, -1) }) do
	other_valid_group_char[code] = true;
end;
|
|
|
|
--[[
	Converts a pattern (array of code points) into the flat token list stored on a RegEx object.

	Parameters
		codes  code-point array of the pattern; `codes.n` is its length and `codes.s` the source string
		flags  flag table; this function reads `unicode`, `ignoreCase`, `extended` and `ungreedy`

	Returns
		on success: outln, group_id, verb_flags
			outln       token array, `outln.group_n` holds the highest capture group number
			group_id    map from group name to group number
			verb_flags  { newline, newline_seq, not_empty } as set by leading (*VERB)s
		on failure: a single string describing the syntax error (the caller raises it)

	Tokens for groups are { 0x28, group number, index of the closing token, kind, extra } and
	{ 0x29, group number, index of the opening token, kind, extra, count = fixed width or nil }.
]]
local function tokenize_ptn(codes, flags)
	if flags.unicode and not options.unicodeData then
		return "options.unicodeData cannot be turned off while having unicode flag";
	end;
	local i, len = 1, codes.n;
	local group_n = 0;
	local outln, group_id, verb_flags = { }, { }, {
		newline = 1, newline_seq = 1, not_empty = 0,
	};
	while i <= len do
		local c = codes[i];
		if c == 0x28 then
			-- Match
			local ret;
			if codes[i + 1] == 0x2A then
				-- (*VERB) or (*name: group prefix
				i += 2;
				local start_i = i;
				-- The name ends right after the first ':' so that word characters
				-- following it belong to the group body, not to the name.
				while codes[i] and codes[i - 1] ~= 0x3A
					and (codes[i] >= 0x30 and codes[i] <= 0x39
					or codes[i] >= 0x41 and codes[i] <= 0x5A
					or codes[i] >= 0x61 and codes[i] <= 0x7A
					or codes[i] == 0x5F or codes[i] == 0x3A) do
					i += 1;
				end;
				if codes[i] ~= 0x29 and codes[i - 1] ~= 0x3A then
					-- fallback as normal and ( can't be repeated
					return "quantifier doesn't follow a repeatable pattern";
				end;
				local selected_verb = utf8_sub(codes.s, start_i, i);
				if selected_verb == "positive_lookahead:" or selected_verb == "negative_lookahead:"
					-- historical misspelling, still accepted for backward compatibility
					or selected_verb == "negative_lookhead:"
					or selected_verb == "positive_lookbehind:" or selected_verb == "negative_lookbehind:"
					or selected_verb:find("^[pn]l[ab]:$") then
					ret = { 0x28, nil, nil, selected_verb:find('^n') and 0x21 or 0x3D, selected_verb:find('b', 3, true) and 1 };
					-- i is already on the first character of the group body; step back so
					-- the increment at the bottom of the loop does not skip it
					i -= 1;
				elseif selected_verb == "atomic:" then
					ret = { 0x28, nil, nil, 0x3E, nil };
					i -= 1;
				elseif selected_verb == "ACCEPT" or selected_verb == "FAIL" or selected_verb == 'F' or selected_verb == "PRUNE" or selected_verb == "SKIP" then
					ret = selected_verb == 'F' and "FAIL" or selected_verb;
				else
					if line_verbs[selected_verb] then
						verb_flags.newline = selected_verb;
					elseif selected_verb == "BSR_ANYCRLF" or selected_verb == "BSR_UNICODE" then
						verb_flags.newline_seq = selected_verb == "BSR_UNICODE" and 1 or 0;
					elseif selected_verb == "NOTEMPTY" or selected_verb == "NOTEMPTY_ATSTART" then
						verb_flags.not_empty = selected_verb == "NOTEMPTY" and 1 or 2;
					else
						return "unknown or malformed verb";
					end;
					if outln[1] then
						return "this verb must be placed at the beginning of the regex";
					end;
				end;
			elseif codes[i + 1] == 0x3F then
				-- ? syntax
				i += 2;
				if codes[i] == 0x23 then
					-- comments
					i = table.find(codes, 0x29, i);
					if not i then
						return "unterminated parenthetical";
					end;
					i += 1;
					continue;
				elseif not codes[i] then
					return "unterminated parenthetical";
				end;
				ret = { 0x28, nil, nil, codes[i], nil };
				if codes[i] == 0x30 and codes[i + 1] == 0x29 then
					-- recursive match entire pattern
					ret[1], ret[2], ret[3], ret[5] = "recurmatch", 0, 0, nil;
					-- leave i on the ')' so it is consumed by the increment below
					-- instead of being parsed as a closing parenthesis
					i += 1;
				elseif codes[i] > 0x30 and codes[i] <= 0x39 then
					-- recursive match
					local org_i = i;
					i += 1;
					while codes[i] and codes[i] >= 0x30 and codes[i] <= 0x39 do
						i += 1;
					end;
					if codes[i] ~= 0x29 then
						return "invalid group structure";
					end;
					ret[1], ret[2], ret[4] = "recurmatch", tonumber(utf8_sub(codes.s, org_i, i)), nil;
				elseif codes[i] == 0x3C and (codes[i + 1] == 0x21 or codes[i + 1] == 0x3D) then
					-- lookbehinds
					i += 1;
					ret[4], ret[5] = codes[i], 1;
				elseif codes[i] == 0x7C then
					-- branch reset
					ret[5] = group_n;
				elseif codes[i] == 0x50 or codes[i] == 0x3C or codes[i] == 0x27 then
					if codes[i] == 0x50 then
						i += 1;
					end;
					if codes[i] == 0x3D then
						-- backref, (?P=name)
						i += 1;
						local start_i = i;
						while codes[i] and
							(codes[i] >= 0x30 and codes[i] <= 0x39
							or codes[i] >= 0x41 and codes[i] <= 0x5A
							or codes[i] >= 0x61 and codes[i] <= 0x7A
							or codes[i] == 0x5F) do
							i += 1;
						end;
						if not codes[i] then
							return "unterminated parenthetical";
						elseif codes[i] ~= 0x29 or i == start_i then
							return "invalid group structure";
						end;
						ret = { "backref", utf8_sub(codes.s, start_i, i) };
					elseif codes[i] == 0x3C or codes[i - 1] ~= 0x50 and codes[i] == 0x27 then
						-- named capture
						local delimiter = codes[i] == 0x27 and 0x27 or 0x3E;
						local start_i = i + 1;
						i += 1;
						if not codes[i] then
							return "unterminated parenthetical";
						elseif codes[i] == 0x29 then
							return "missing character in subpattern";
						elseif codes[i] >= 0x30 and codes[i] <= 0x39 then
							return "subpattern name must not begin with a digit";
						elseif not (codes[i] >= 0x41 and codes[i] <= 0x5A or codes[i] >= 0x61 and codes[i] <= 0x7A or codes[i] == 0x5F) then
							return "invalid character in subpattern";
						end;
						i += 1;
						while codes[i] and
							(codes[i] >= 0x30 and codes[i] <= 0x39
							or codes[i] >= 0x41 and codes[i] <= 0x5A
							or codes[i] >= 0x61 and codes[i] <= 0x7A
							or codes[i] == 0x5F) do
							i += 1;
						end;
						if not codes[i] then
							return "unterminated parenthetical";
						elseif codes[i] ~= delimiter then
							return "invalid character in subpattern";
						end;
						local name = utf8_sub(codes.s, start_i, i);
						group_n += 1;
						if (group_id[name] or group_n) ~= group_n then
							return "subpattern name already exists";
						end;
						for name1, group_n1 in pairs(group_id) do
							if name ~= name1 and group_n == group_n1 then
								return "different names for subpatterns of the same number aren't permitted";
							end;
						end;
						group_id[name] = group_n;
						ret[2], ret[4] = group_n, nil;
					else
						return "invalid group structure";
					end;
				elseif not other_valid_group_char[codes[i]] then
					return "invalid group structure";
				end;
			else
				group_n += 1;
				ret = { 0x28, group_n, nil, nil };
			end;
			if ret then
				table.insert(outln, ret);
			end;
		elseif c == 0x29 then
			-- Close parenthesis
			-- Walk backwards to the innermost still-open group while working out
			-- whether every alternative inside it has the same fixed width.
			local i1 = #outln + 1;
			local lookbehind_c = -1;
			local current_lookbehind_c = 0;
			local max_c, group_c = 0, 0;
			repeat
				i1 -= 1;
				local v, is_table = outln[i1], type(outln[i1]) == "table";
				if is_table and v[1] == 0x28 then
					group_c += 1;
					if current_lookbehind_c and v.count then
						current_lookbehind_c += v.count;
					end;
					if not v[3] then
						if v[4] == 0x7C then
							group_n = v[5] + math.max(max_c, group_c);
						end;
						if current_lookbehind_c ~= lookbehind_c and lookbehind_c ~= -1 then
							lookbehind_c = nil;
						else
							lookbehind_c = current_lookbehind_c;
						end;
						break;
					end;
				elseif v == alternation then
					if current_lookbehind_c ~= lookbehind_c and lookbehind_c ~= -1 then
						lookbehind_c, current_lookbehind_c = nil, nil;
					else
						lookbehind_c, current_lookbehind_c = current_lookbehind_c, 0;
					end;
					max_c, group_c = math.max(max_c, group_c), 0;
				elseif current_lookbehind_c then
					if is_table and v[1] == "quantifier" then
						if v[2] == v[3] then
							current_lookbehind_c += v[2];
						else
							current_lookbehind_c = nil;
						end;
					else
						current_lookbehind_c += 1;
					end;
				end;
			until i1 < 1;
			if i1 < 1 then
				return "unmatched ) in regular expression";
			end;
			local v = outln[i1];
			local outln_len_p_1 = #outln + 1;
			local ret = { 0x29, v[2], i1, v[4], v[5], count = lookbehind_c };
			if (v[4] == 0x21 or v[4] == 0x3D) and v[5] and not lookbehind_c then
				return "lookbehind assertion is not fixed width";
			end;
			v[3] = outln_len_p_1;
			table.insert(outln, ret);
		elseif c == 0x2E then
			table.insert(outln, dot);
		elseif c == 0x5B then
			-- Character set; members are prepended to `ret`, so ret[1] is always the latest one
			local negate, char_class = false, nil;
			i += 1;
			local start_i = i;
			if codes[i] == 0x5E then
				negate = true;
				i += 1;
			elseif codes[i] == 0x2E or codes[i] == 0x3A or codes[i] == 0x3D then
				-- POSIX character classes
				char_class = codes[i];
			end;
			local ret;
			if codes[i] == 0x5B or codes[i] == 0x5C then
				ret = { };
			else
				ret = { codes[i] };
				i += 1;
			end;
			while codes[i] ~= 0x5D do
				if not codes[i] then
					return "unterminated character class";
				elseif codes[i] == 0x2D and ret[1] and type(ret[1]) == "number" then
					if codes[i + 1] == 0x5D then
						table.insert(ret, 1, 0x2D);
					else
						i += 1;
						local ret_c = codes[i];
						if not ret_c then
							return "unterminated character class";
						elseif ret_c == 0x5B then
							if codes[i + 1] == 0x2E or codes[i + 1] == 0x3A or codes[i + 1] == 0x3D then
								-- Check for POSIX character class, name does not matter
								local i1 = i + 1;
								repeat
									-- always resume after the previous hit so an escaped ] cannot stall the search
									i1 = table.find(codes, 0x5D, i1 + 1);
								until not i1 or codes[i1 - 1] ~= 0x5C;
								if not i1 then
									return "unterminated character class";
								elseif codes[i1 - 1] == codes[i + 1] and i1 - 1 ~= i + 1 then
									return "invalid range in character class";
								end;
							end;
							if ret[1] > 0x5B then
								return "invalid range in character class";
							end;
						elseif ret_c == 0x5C then
							i += 1;
							if not codes[i] then
								return "unterminated character class";
							elseif codes[i] == 0x78 then
								local radix0, radix1;
								i += 1;
								if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
									radix0 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
									i += 1;
									if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
										radix1 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
									else
										i -= 1;
									end;
								else
									i -= 1;
								end;
								ret_c = radix0 and (radix1 and 16 * radix0 + radix1 or radix0) or 0;
							elseif codes[i] >= 0x30 and codes[i] <= 0x37 then
								local radix0, radix1, radix2 = codes[i] - 0x30, nil, nil;
								i += 1;
								if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
									radix1 = codes[i] - 0x30;
									i += 1;
									if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
										radix2 = codes[i] - 0x30;
									else
										i -= 1;
									end;
								else
									i -= 1;
								end;
								ret_c = radix1 and (radix2 and 64 * radix0 + 8 * radix1 + radix2 or 8 * radix0 + radix1) or radix0;
							else
								ret_c = escape_chars[codes[i]] or codes[i];
								if type(ret_c) ~= "number" then
									return "invalid range in character class";
								end;
							end;
						elseif ret[1] > ret_c then
							return "invalid range in character class";
						end;
						ret[1] = { "range", ret[1], ret_c };
					end;
				elseif codes[i] == 0x5B then
					if codes[i + 1] == 0x2E or codes[i + 1] == 0x3A or codes[i + 1] == 0x3D then
						local i1 = i + 1;
						repeat
							-- always resume after the previous hit so an escaped ] cannot stall the search
							i1 = table.find(codes, 0x5D, i1 + 1);
						until not i1 or codes[i1 - 1] ~= 0x5C;
						if not i1 then
							return "unterminated character class";
						elseif codes[i1 - 1] ~= codes[i + 1] or i1 - 1 == i + 1 then
							table.insert(ret, 1, 0x5B);
						elseif codes[i1 - 1] == 0x2E or codes[i1 - 1] == 0x3D then
							return "POSIX collating elements aren't supported";
						elseif codes[i1 - 1] == 0x3A then
							-- I have no plans to support escape codes (\) in character class names
							-- In [:^name:] the ^ sits directly after "[:", i.e. at i + 2
							local class_negate = codes[i + 2] == 0x5E;
							local class_name = utf8_sub(codes.s, i + (class_negate and 3 or 2), i1 - 1);
							-- If not valid then throw an error
							if not posix_class_names[class_name] then
								return "unknown POSIX class name";
							end;
							table.insert(ret, 1, { "class", class_name, class_negate });
							i = i1;
						end;
					else
						table.insert(ret, 1, 0x5B);
					end;
				elseif codes[i] == 0x5C then
					i += 1;
					if not codes[i] then
						return "unterminated character class";
					elseif codes[i] == 0x78 then
						local radix0, radix1;
						i += 1;
						if codes[i] == 0x7B then
							i += 1;
							local org_i = i;
							while codes[i] and
								(codes[i] >= 0x30 and codes[i] <= 0x39
								or codes[i] >= 0x41 and codes[i] <= 0x46
								or codes[i] >= 0x61 and codes[i] <= 0x66) do
								i += 1;
							end;
							if codes[i] ~= 0x7D or i == org_i then
								return "malformed hexadecimal character";
							elseif i - org_i > 4 then
								return "character offset too large";
							end;
							table.insert(ret, 1, tonumber(utf8_sub(codes.s, org_i, i), 16));
						else
							if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
								radix0 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
								i += 1;
								if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
									radix1 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
								else
									i -= 1;
								end;
							else
								i -= 1;
							end;
							table.insert(ret, 1, radix0 and (radix1 and 16 * radix0 + radix1 or radix0) or 0);
						end;
					elseif codes[i] >= 0x30 and codes[i] <= 0x37 then
						local radix0, radix1, radix2 = codes[i] - 0x30, nil, nil;
						i += 1;
						if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
							radix1 = codes[i] - 0x30;
							i += 1;
							if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
								radix2 = codes[i] - 0x30;
							else
								i -= 1;
							end;
						else
							i -= 1;
						end;
						table.insert(ret, 1, radix1 and (radix2 and 64 * radix0 + 8 * radix1 + radix2 or 8 * radix0 + radix1) or radix0);
					elseif codes[i] == 0x45 then
						-- intentionally left blank, \E that's not preceded \Q is ignored
					elseif codes[i] == 0x51 then
						-- \Q...\E: everything up to \E is a literal member of this class
						local start_i = i + 1;
						repeat
							i = table.find(codes, 0x5C, i + 1);
						until not i or codes[i + 1] == 0x45;
						if not i then
							-- the quote runs to the end of the pattern, so the class is never closed
							return "unterminated character class";
						end;
						for i1 = start_i, i - 1 do
							table.insert(ret, 1, codes[i1]);
						end;
						i += 1;
					elseif codes[i] == 0x4E then
						if codes[i + 1] == 0x7B and codes[i + 2] == 0x55 and codes[i + 3] == 0x2B and flags.unicode then
							i += 4;
							local start_i = i;
							while codes[i] and
								(codes[i] >= 0x30 and codes[i] <= 0x39
								or codes[i] >= 0x41 and codes[i] <= 0x46
								or codes[i] >= 0x61 and codes[i] <= 0x66) do
								i += 1;
							end;
							if codes[i] ~= 0x7D or i == start_i then
								return "malformed Unicode code point";
							end;
							-- the digits of \N{U+hhhh} are hexadecimal
							local code_point = tonumber(utf8_sub(codes.s, start_i, i), 16);
							table.insert(ret, 1, code_point);
						else
							return "invalid escape sequence";
						end;
					elseif codes[i] == 0x50 or codes[i] == 0x70 then
						if not options.unicodeData then
							return "options.unicodeData cannot be turned off when using \\p";
						end;
						-- \P (upper case) is the negated form of \p
						local negate_property = codes[i] == 0x50;
						i += 1;
						if codes[i] ~= 0x7B then
							local c_name = utf8.char(codes[i] or 0);
							if not valid_categories[c_name] then
								return "unknown or malformed script name";
							end;
							table.insert(ret, 1, { "category", negate_property, c_name });
						else
							i += 1;
							if codes[i] == 0x5E then
								i += 1;
								negate_property = not negate_property;
							end;
							local start_i = i;
							while codes[i] and
								(codes[i] >= 0x30 and codes[i] <= 0x39
								or codes[i] >= 0x41 and codes[i] <= 0x5A
								or codes[i] >= 0x61 and codes[i] <= 0x7A
								or codes[i] == 0x5F) do
								i += 1;
							end;
							if codes[i] ~= 0x7D then
								return "unknown or malformed script name";
							end;
							local c_name = utf8_sub(codes.s, start_i, i);
							local script_set = chr_scripts[c_name];
							if script_set then
								table.insert(ret, 1, { "charset", negate_property, script_set });
							elseif not valid_categories[c_name] then
								return "unknown or malformed script name";
							else
								table.insert(ret, 1, { "category", negate_property, c_name });
							end;
						end;
					elseif codes[i] == 0x6F then
						i += 1;
						if codes[i] ~= 0x7B then
							return "malformed octal code";
						end;
						i += 1;
						local org_i = i;
						while codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 do
							i += 1;
						end;
						if codes[i] ~= 0x7D or i == org_i then
							return "malformed octal code";
						end;
						local ret_chr = tonumber(utf8_sub(codes.s, org_i, i), 8);
						if ret_chr > 0xFFFF then
							return "character offset too large";
						end;
						table.insert(ret, 1, ret_chr);
					else
						local esc_char = escape_chars[codes[i]];
						table.insert(ret, 1, type(esc_char) == "string" and { "class", esc_char, false } or esc_char or codes[i]);
					end;
				elseif flags.ignoreCase and codes[i] >= 0x61 and codes[i] <= 0x7A then
					table.insert(ret, 1, codes[i] - 0x20);
				else
					table.insert(ret, 1, codes[i]);
				end;
				i += 1;
			end;
			if codes[i - 1] == char_class and i - 1 ~= start_i then
				return char_class == 0x3A and "POSIX named classes are only support within a character set" or "POSIX collating elements aren't supported";
			end;
			if not ret[2] and not negate then
				table.insert(outln, ret[1]);
			else
				table.insert(outln, { "charset", negate, ret });
			end;
		elseif c == 0x5C then
			-- Escape char
			i += 1;
			local escape_c = codes[i];
			if not escape_c then
				return "pattern may not end with a trailing backslash";
			elseif escape_c >= 0x30 and escape_c <= 0x39 then
				local org_i = i;
				while codes[i + 1] and codes[i + 1] >= 0x30 and codes[i + 1] <= 0x39 do
					i += 1;
				end;
				local escape_d = tonumber(utf8_sub(codes.s, org_i, i + 1));
				if escape_d > group_n and i ~= org_i then
					-- multi-digit number larger than any group seen so far: read it as an octal code
					i = org_i;
					local radix0, radix1, radix2;
					if codes[i] <= 0x37 then
						radix0 = codes[i] - 0x30;
						i += 1;
						if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
							radix1 = codes[i] - 0x30;
							i += 1;
							if codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 then
								radix2 = codes[i] - 0x30;
							else
								i -= 1;
							end;
						else
							i -= 1;
						end;
					end;
					table.insert(outln, radix0 and (radix1 and (radix2 and 64 * radix0 + 8 * radix1 + radix2 or 8 * radix0 + radix1) or radix0) or codes[org_i]);
				else
					table.insert(outln, { "backref", escape_d });
				end;
			elseif escape_c == 0x45 then
				-- intentionally left blank, \E that's not preceded \Q is ignored
			elseif escape_c == 0x51 then
				local start_i = i + 1;
				repeat
					i = table.find(codes, 0x5C, i + 1);
				until not i or codes[i + 1] == 0x45;
				table.move(codes, start_i, i and i - 1 or #codes, #outln + 1, outln);
				if not i then
					-- no \E: the rest of the pattern was literal, nothing left to tokenize
					break;
				end;
				i += 1;
			elseif escape_c == 0x4E then
				if codes[i + 1] == 0x7B and codes[i + 2] == 0x55 and codes[i + 3] == 0x2B and flags.unicode then
					i += 4;
					local start_i = i;
					while codes[i] and
						(codes[i] >= 0x30 and codes[i] <= 0x39
						or codes[i] >= 0x41 and codes[i] <= 0x46
						or codes[i] >= 0x61 and codes[i] <= 0x66) do
						i += 1;
					end;
					if codes[i] ~= 0x7D or i == start_i then
						return "malformed Unicode code point";
					end;
					-- the digits of \N{U+hhhh} are hexadecimal
					local code_point = tonumber(utf8_sub(codes.s, start_i, i), 16);
					table.insert(outln, code_point);
				else
					table.insert(outln, escape_chars[0x4E]);
				end;
			elseif escape_c == 0x50 or escape_c == 0x70 then
				if not options.unicodeData then
					return "options.unicodeData cannot be turned off when using \\p";
				end;
				-- \P (upper case) is the negated form of \p
				local negate = escape_c == 0x50;
				i += 1;
				if codes[i] ~= 0x7B then
					local c_name = utf8.char(codes[i] or 0);
					if not valid_categories[c_name] then
						return "unknown or malformed script name";
					end;
					table.insert(outln, { "category", negate, c_name });
				else
					i += 1;
					if codes[i] == 0x5E then
						i += 1;
						negate = not negate;
					end;
					local start_i = i;
					while codes[i] and
						(codes[i] >= 0x30 and codes[i] <= 0x39
						or codes[i] >= 0x41 and codes[i] <= 0x5A
						or codes[i] >= 0x61 and codes[i] <= 0x7A
						or codes[i] == 0x5F) do
						i += 1;
					end;
					if codes[i] ~= 0x7D then
						return "unknown or malformed script name";
					end;
					local c_name = utf8_sub(codes.s, start_i, i);
					local script_set = chr_scripts[c_name];
					if script_set then
						table.insert(outln, { "charset", negate, script_set });
					elseif not valid_categories[c_name] then
						return "unknown or malformed script name";
					else
						table.insert(outln, { "category", negate, c_name });
					end;
				end;
			elseif escape_c == 0x67 and codes[i + 1] and (codes[i + 1] == 0x7B or codes[i + 1] >= 0x30 and codes[i + 1] <= 0x39) then
				-- \g1 or \g{1}: numeric back reference
				local is_grouped = false;
				i += 1;
				if codes[i] == 0x7B then
					i += 1;
					is_grouped = true;
				end;
				local org_i = i;
				-- the reference is parsed as a decimal number, so only decimal digits belong to it
				while codes[i] and codes[i] >= 0x30 and codes[i] <= 0x39 do
					i += 1;
				end;
				if is_grouped and (codes[i] ~= 0x7D or i == org_i) then
					return "malformed reference code";
				end;
				-- i is one past the last digit here, which is the exclusive end utf8_sub expects
				local ref_name = tonumber(utf8_sub(codes.s, org_i, i));
				table.insert(outln, { "backref", ref_name });
				if not is_grouped then
					-- step back onto the last digit; the increment below moves past it
					i -= 1;
				end;
			elseif escape_c == 0x6F then
				i += 1;
				if codes[i] ~= 0x7B then
					return "malformed octal code";
				end;
				i += 1;
				local org_i = i;
				while codes[i] and codes[i] >= 0x30 and codes[i] <= 0x37 do
					i += 1;
				end;
				if codes[i] ~= 0x7D or i == org_i then
					return "malformed octal code";
				end;
				local ret_chr = tonumber(utf8_sub(codes.s, org_i, i), 8);
				if ret_chr > 0xFFFF then
					return "character offset too large";
				end;
				table.insert(outln, ret_chr);
			elseif escape_c == 0x78 then
				local radix0, radix1;
				i += 1;
				if codes[i] == 0x7B then
					i += 1;
					local org_i = i;
					while codes[i] and
						(codes[i] >= 0x30 and codes[i] <= 0x39
						or codes[i] >= 0x41 and codes[i] <= 0x46
						or codes[i] >= 0x61 and codes[i] <= 0x66) do
						i += 1;
					end;
					if codes[i] ~= 0x7D or i == org_i then
						return "malformed hexadecimal code";
					elseif i - org_i > 4 then
						return "character offset too large";
					end;
					table.insert(outln, tonumber(utf8_sub(codes.s, org_i, i), 16));
				else
					if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
						radix0 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
						i += 1;
						if codes[i] and (codes[i] >= 0x30 and codes[i] <= 0x39 or codes[i] >= 0x41 and codes[i] <= 0x46 or codes[i] >= 0x61 and codes[i] <= 0x66) then
							radix1 = codes[i] - ((codes[i] >= 0x41 and codes[i] <= 0x5A) and 0x37 or (codes[i] >= 0x61 and codes[i] <= 0x7A) and 0x57 or 0x30);
						else
							i -= 1;
						end;
					else
						i -= 1;
					end;
					table.insert(outln, radix0 and (radix1 and 16 * radix0 + radix1 or radix0) or 0);
				end;
			else
				local esc_char = b_escape_chars[escape_c] or escape_chars[escape_c];
				table.insert(outln, esc_char or escape_c);
			end;
		elseif c == 0x2A or c == 0x2B or c == 0x3F or c == 0x7B then
			-- Quantifier
			local start_q, end_q;
			if c == 0x7B then
				local org_i = i + 1;
				local start_i;
				while codes[i + 1] and (codes[i + 1] >= 0x30 and codes[i + 1] <= 0x39 or codes[i + 1] == 0x2C and not start_i and i + 1 ~= org_i) do
					i += 1;
					if codes[i] == 0x2C then
						start_i = i;
					end;
				end;
				if codes[i + 1] == 0x7D then
					i += 1;
					if not start_i then
						start_q = tonumber(utf8_sub(codes.s, org_i, i));
						end_q = start_q;
					else
						start_q, end_q = tonumber(utf8_sub(codes.s, org_i, start_i)), start_i + 1 == i and math.huge or tonumber(utf8_sub(codes.s, start_i + 1, i));
						if end_q < start_q then
							return "numbers out of order in {} quantifier";
						end;
					end;
				else
					-- not a valid {m,n}: keep the characters read so far as literals
					table.move(codes, org_i - 1, i, #outln + 1, outln);
				end;
			else
				start_q, end_q = c == 0x2B and 1 or 0, c == 0x3F and 1 or math.huge;
			end;
			if start_q then
				local quantifier_type = flags.ungreedy and "lazy" or "greedy";
				if codes[i + 1] == 0x2B or codes[i + 1] == 0x3F then
					i += 1;
					quantifier_type = codes[i] == 0x2B and "possessive" or flags.ungreedy and "greedy" or "lazy";
				end;
				local outln_len = #outln;
				local last_outln_value = outln[outln_len];
				if not last_outln_value or type(last_outln_value) == "table" and (last_outln_value[1] == "quantifier" or last_outln_value[1] == 0x28 or b_escape_chars[last_outln_value[1]])
					or last_outln_value == alternation or type(last_outln_value) == "string" then
					return "quantifier doesn't follow a repeatable pattern";
				end;
				if end_q == 0 then
					table.remove(outln);
				elseif start_q ~= 1 or end_q ~= 1 then
					if type(last_outln_value) == "table" and last_outln_value[1] == 0x29 then
						-- a quantified group wraps its opening token, not the closing one
						outln_len = last_outln_value[3];
					end;
					outln[outln_len] = { "quantifier", start_q, end_q, quantifier_type, outln[outln_len] };
				end;
			end;
		elseif c == 0x7C then
			-- Alternation
			table.insert(outln, alternation);
			-- Inside a branch-reset group each alternative restarts group numbering
			local i1 = #outln;
			repeat
				i1 -= 1;
				local v1, is_table = outln[i1], type(outln[i1]) == "table";
				if is_table and v1[1] == 0x29 then
					i1 = outln[i1][3];
				elseif is_table and v1[1] == 0x28 then
					if v1[4] == 0x7C then
						group_n = v1[5];
					end;
					break;
				end;
			until not v1;
		elseif c == 0x24 or c == 0x5E then
			table.insert(outln, c == 0x5E and beginning_str or end_str);
		elseif flags.ignoreCase and c >= 0x61 and c <= 0x7A then
			table.insert(outln, c - 0x20);
		elseif flags.extended and (c >= 0x09 and c <= 0x0D or c == 0x20 or c == 0x23) then
			-- extended mode: whitespace is dropped and # starts a comment up to the end of the line
			if c == 0x23 then
				repeat
					i += 1;
				until not codes[i] or codes[i] == 0x0A or codes[i] == 0x0D;
			end;
		else
			table.insert(outln, c);
		end;
		i += 1;
	end;

	-- Final pass: every group must be closed and every reference must resolve
	local max_group_n = 0;
	for _, v in ipairs(outln) do
		if type(v) == "table" and (v[1] == 0x28 or v[1] == "quantifier" and type(v[5]) == "table" and v[5][1] == 0x28) then
			if v[1] == "quantifier" then
				v = v[5];
			end;
			if not v[3] then
				return "unterminated parenthetical";
			elseif v[2] then
				max_group_n = math.max(max_group_n, v[2]);
			end;
		elseif type(v) == "table" and (v[1] == "backref" or v[1] == "recurmatch") then
			if not group_id[v[2]] and (type(v[2]) ~= "number" or v[2] > group_n) then
				return "reference to a non-existent or invalid subpattern";
			elseif v[1] == "recurmatch" and v[2] ~= 0 then
				for i1, v1 in ipairs(outln) do
					if type(v1) == "table" and v1[1] == 0x28 and v1[2] == v[2] then
						v[3] = i1;
						break;
					end;
				end;
			elseif type(v[2]) == "string" then
				v[2] = group_id[v[2]];
			end;
		end;
	end;
	outln.group_n = max_group_n;
	return outln, group_id, verb_flags;
end;
|
|
|
|
-- Resolve options.cacheSize into `cacheSize`: false when caching is disabled, otherwise a number.
-- nil and false are documented as accepted values and simply turn the cache off;
-- anything else has to be a number or a string that converts to one.
local cacheSize = options.cacheSize;
if cacheSize == nil or cacheSize == false then
	cacheSize = false;
else
	cacheSize = tonumber(cacheSize);
	if not cacheSize then
		error(string.format("expected number for options.cacheSize, got %s", typeof(options.cacheSize)), 2);
	end;
	-- A size that floors to zero disables the cache; NaN and negative sizes
	-- do not compare equal to zero and are rejected below.
	if math.floor(cacheSize) == 0 then
		cacheSize = false;
	end;
end;
-- Parallel arrays, most recent entry first: cache_pattern_names[k] is the key of cache_pattern[k]
local cache_pattern, cache_pattern_names;
if not cacheSize then
	-- caching disabled, no tables needed
elseif cacheSize < 0 or cacheSize ~= cacheSize then
	error("cache size cannot be a negative number or a NaN", 2);
elseif cacheSize == math.huge then
	cache_pattern, cache_pattern_names = { nil }, { nil };
elseif cacheSize >= 2 ^ 32 then
	error("cache size too large", 2);
else
	cache_pattern, cache_pattern_names = table.create(cacheSize), table.create(cacheSize);
end;
if cacheSize then
	-- Empties the pattern cache
	function re.purge()
		table.clear(cache_pattern_names);
		table.clear(cache_pattern);
	end;
	-- the original (misspelt) name is kept so existing callers keep working
	re.pruge = re.purge;
end;
|
|
|
|
--[[
	Builds a RegEx proxy object from an already-split pattern.

	str_arr      : pattern array; its `s` field (the pattern source string) is used for the cache key
	flags        : table of boolean flags; it becomes the object's __index table (backed by re_m)
	flag_repr    : sorted flag characters as a string
	pattern_repr : delimited textual form of the pattern, stored for the object's string representation

	The tokenised pattern is looked up in / stored into the cache when caching is enabled.
	Raises the tokenizer's message when tokenize_ptn reports a failure (returns a string).
]]
local function new_re(str_arr, flags, flag_repr, pattern_repr)
	local tokenized_ptn, group_id, verb_flags;
	-- The cache key is "<pattern source>|<flags>".
	local cache_format = cacheSize and string.format("%s|%s", str_arr.s, flag_repr);
	local cached_token = cacheSize and cache_pattern[table.find(cache_pattern_names, cache_format)];
	if cached_token then
		tokenized_ptn, group_id, verb_flags = table.unpack(cached_token, 1, 3);
	else
		tokenized_ptn, group_id, verb_flags = tokenize_ptn(str_arr, flags);
		if type(tokenized_ptn) == "string" then
			-- new_re is only reached through re.new / re.fromstring, and Luau does not
			-- eliminate tail calls, so level 3 is the user's call site (level 2 would
			-- point inside this module).
			error(tokenized_ptn, 3);
		end;
		-- Patterns that tokenise to nothing are never cached.
		if cacheSize and tokenized_ptn[1] then
			-- Newest entry goes to the front; the entry pushed past the capacity is dropped.
			table.insert(cache_pattern_names, 1, cache_format);
			table.insert(cache_pattern, 1, { tokenized_ptn, group_id, verb_flags });
			if cacheSize ~= math.huge then
				table.remove(cache_pattern_names, cacheSize + 1);
				table.remove(cache_pattern, cacheSize + 1);
			end;
		end;
	end;

	-- The public object is an opaque userdata; its real state lives in the `proxy` table.
	local object = newproxy(true);
	proxy[object] = { name = "RegEx", flags = flags, flag_repr = flag_repr, pattern_repr = pattern_repr, token = tokenized_ptn, group_id = group_id, verb_flags = verb_flags };
	local object_mt = getmetatable(object);
	-- Indexing the object reads a flag first and falls back to whatever re_m resolves.
	object_mt.__index = setmetatable(flags, re_m);
	object_mt.__tostring = re_tostr;
	object_mt.__metatable = lockmsg;

	return object;
end;
|
|
|
|
--[[
	gsub callback for the pattern "(\\*)/": receives the run of backslashes that
	precedes a forward slash and returns the text that replaces "<backslashes>/".

	An even number of backslashes (including none) means the slash is unescaped, so one
	backslash is added; an odd number means it is already escaped and is kept as is.
	The slash itself is always re-emitted (it is consumed by the match).
]]
local function escape_fslash(pre)
	return (#pre % 2 == 0 and '\\' or '') .. pre .. '/';
end;
|
|
|
|
-- table.sort comparator: orders flag characters by their lower-cased form.
local function sort_flag_chr(a, b)
	local lhs, rhs = string.lower(a), string.lower(b);
	return lhs < rhs;
end;
|
|
|
|
--[[
	re.new(ptn [, flags_str]) -> RegEx object

	ptn       : pattern source (string; a number is converted to its string form)
	flags_str : optional string (or number) of flag characters; each character is looked up
	            in flag_map and may appear only once

	Raises when an argument has the wrong type, when a flag is unknown or repeated,
	or when the pattern fails to tokenise (raised from new_re).
]]
function re.new(...)
	if select('#', ...) == 0 then
		error("missing argument #1 (string expected)", 2);
	end;
	local ptn, flags_str = ...;
	if type(ptn) == "number" then
		ptn ..= '';
	elseif type(ptn) ~= "string" then
		error(string.format("invalid argument #1 (string expected, got %s)", typeof(ptn)), 2);
	end;
	if type(flags_str) ~= "string" and type(flags_str) ~= "number" and flags_str ~= nil then
		error(string.format("invalid argument #2 (string expected, got %s)", typeof(flags_str)), 2);
	end;

	local flags = {
		anchored = false, caseless = false, multiline = false, dotall = false, unicode = false, ungreedy = false, extended = false,
	};
	local flag_repr = { };
	for f in string.gmatch(flags_str or '', utf8.charpattern) do
		-- Anything other than `false` here means the flag is unknown (nil) or already set (true).
		if flags[flag_map[f]] ~= false then
			-- Level 2 blames the caller of re.new, like every other error in this function.
			error("invalid regular expression flag " .. f, 2);
		end;
		flags[flag_map[f]] = true;
		table.insert(flag_repr, f);
	end;
	table.sort(flag_repr, sort_flag_chr);
	flag_repr = table.concat(flag_repr);
	-- Parentheses keep only gsub's first result (the string), dropping the match count.
	local escaped_ptn = (ptn:gsub("(\\*)/", escape_fslash));
	return new_re(to_str_arr(ptn), flags, flag_repr, string.format("/%s/", escaped_ptn));
end;
|
|
|
|
--[[
	re.fromstring(ptn) -> RegEx object

	Parses a delimited pattern literal such as "/abc/i": the first character is the
	delimiter, the pattern runs up to the next unescaped occurrence of that delimiter,
	and everything after it is read as flag characters.

	Raises when the argument is missing or of the wrong type, when the string is empty,
	when the delimiter is alphanumeric or a backslash, when no closing delimiter is found,
	when a flag is unknown or repeated, or when the pattern fails to tokenise.
]]
function re.fromstring(...)
	if select('#', ...) == 0 then
		error("missing argument #1 (string expected)", 2);
	end;
	local ptn = ...;
	if type(ptn) == "number" then
		ptn ..= '';
	elseif type(ptn) ~= "string" then
		-- The level belongs to error(), not to string.format().
		error(string.format("invalid argument #1 (string expected, got %s)", typeof(ptn)), 2);
	end;
	local str_arr = to_str_arr(ptn);
	local delimiter = str_arr[1];
	if not delimiter then
		error("empty regex", 2);
	elseif delimiter == 0x5C or (delimiter >= 0x30 and delimiter <= 0x39) or (delimiter >= 0x41 and delimiter <= 0x5A) or (delimiter >= 0x61 and delimiter <= 0x7A) then
		error("delimiter must not be alphanumeric or a backslash", 2);
	end;

	-- Find the closing delimiter: the first one preceded by an even number of backslashes.
	local i0 = 1;
	repeat
		i0 = table.find(str_arr, delimiter, i0 + 1);
		if not i0 then
			error(string.format("no ending delimiter ('%s') found", utf8.char(delimiter)), 2);
		end;
		-- escape_count ends up as (number of preceding backslashes + 1).
		local escape_count = 1;
		while str_arr[i0 - escape_count] == 0x5C do
			escape_count += 1;
		end;
	until escape_count % 2 == 1;

	-- str_arr is indexed per character (entries are turned back into text with utf8.char)
	-- while string.sub works on bytes, so convert the character positions to byte offsets.
	-- NOTE(review): assumes str_arr.s is the same text as ptn with one str_arr entry per
	-- UTF-8 code point - confirm against to_str_arr.
	local source = str_arr.s;
	local body_first = utf8.offset(source, 2);
	local closing_first = utf8.offset(source, i0);
	local closing_last = utf8.offset(source, i0 + 1) - 1;

	local flags = {
		anchored = false, caseless = false, multiline = false, dotall = false, unicode = false, ungreedy = false, extended = false,
	};
	local flag_repr = { };
	-- Pop the trailing flag characters off the array until only "<delim>pattern<delim>" is left.
	while str_arr.n > i0 do
		local f = utf8.char(table.remove(str_arr));
		str_arr.n -= 1;
		-- Anything other than `false` here means the flag is unknown (nil) or already set (true).
		if flags[flag_map[f]] ~= false then
			-- Level 2 blames the caller of re.fromstring, like the other errors in this function.
			error("invalid regular expression flag " .. f, 2);
		end;
		flags[flag_map[f]] = true;
		table.insert(flag_repr, f);
	end;
	table.sort(flag_repr, sort_flag_chr);
	flag_repr = table.concat(flag_repr);

	-- Strip both delimiters from the array and from the source text.
	table.remove(str_arr, 1);
	table.remove(str_arr);
	str_arr.n -= 2;
	str_arr.s = string.sub(source, body_first, closing_first - 1);
	-- The representation is the literal up to and including the closing delimiter (flags excluded).
	return new_re(str_arr, flags, flag_repr, string.sub(ptn, 1, closing_last));
end;
|
|
|
|
-- Textual escape sequences substituted for control characters by re.escape.
local re_escape_line_chrs = {
	['\0'] = '\\x00',
	['\t'] = '\\t',
	['\n'] = '\\n',
	['\f'] = '\\f',
	['\r'] = '\\r',
};
|
|
|
|
--[[
	re.escape(str [, extended [, delimiter]]) -> string

	Returns str with regex metacharacters backslash-escaped so that it can be embedded
	in a pattern as literal text.

	str       : text to escape (string; a number is converted to its string form)
	extended  : when truthy, whitespace is escaped as well
	delimiter : optional single character that is escaped too; it must not be
	            alphanumeric (an escaped letter or digit would read as an escape
	            sequence or a back-reference) or a backslash

	NUL, form feed, newline, carriage return and tab are written as the textual escapes
	listed in re_escape_line_chrs.
	Raises on a missing/invalid argument or an unusable delimiter.
]]
function re.escape(...)
	if select('#', ...) == 0 then
		error("missing argument #1 (string expected)", 2);
	end;
	local str, extended, delimiter = ...;
	if type(str) == "number" then
		str ..= '';
	elseif type(str) ~= "string" then
		error(string.format("invalid argument #1 to 'escape' (string expected, got %s)", typeof(str)), 2);
	end;
	if delimiter == nil then
		delimiter = '';
	elseif type(delimiter) == "number" then
		delimiter ..= '';
	elseif type(delimiter) ~= "string" then
		error(string.format("invalid argument #3 to 'escape' (string expected, got %s)", typeof(delimiter)), 2);
	end;
	-- utf8.len yields nil for malformed UTF-8, so test for that before comparing.
	local delimiter_len = utf8.len(delimiter);
	if not delimiter_len or delimiter_len > 1 then
		error("delimiter must be a single character", 2);
	elseif delimiter:match("^[%w\\]$") then
		error("delimiter must not be alphanumeric or a backslash", 2);
	end;

	-- Lua character class of everything that needs a backslash. '%' and ']' are magic
	-- inside a class, so a delimiter equal to either gets a '%' prefix.
	-- NOTE(review): a multi-byte delimiter is added byte by byte and may match bytes of
	-- other characters - confirm whether non-ASCII delimiters need to be supported.
	local meta_class = string.format("[\\%s#$()%%%%*+.?[%%]^{|%s]", extended and '%s' or '', (delimiter:find'^[%%%]]$' and '%' or '') .. delimiter);
	-- Matched control characters (reachable here in extended mode or as the delimiter)
	-- take their textual escape; everything else gets a backslash prefix.
	local function escape_chr(chr)
		return re_escape_line_chrs[chr] or '\\' .. chr;
	end;
	local escaped = string.gsub(str, meta_class, escape_chr);
	-- Control characters are rewritten last so the backslashes they introduce are not
	-- escaped a second time.
	return (string.gsub(escaped, "[\0\f\n\r\t]", re_escape_line_chrs));
end;
|
|
|
|
--[[
	re.type(value) -> the `name` recorded in the proxy table for value,
	or a falsy value when value has no proxy entry.
	Raises when called without any argument.
]]
function re.type(...)
	if select('#', ...) == 0 then
		error("missing argument #1", 2);
	end;
	local value = ...;
	local data = proxy[value];
	return data and data.name;
end;
|
|
|
|
-- Copy every entry of re_m onto the module table `re`, so each one is also reachable as re.<name>.
-- TODO: table.foreach is currently used as top-level loops needlessly increase native code size for this module
|
|
table.foreach(re_m, function(k, f) re[k] = f end)
|
|
|
|
-- From here on re_m is a metatable whose __index is the table that was just copied from.
-- new_re attaches it to each object's flags table via setmetatable(flags, re_m).
re_m = { __index = re_m };
|
|
|
|
-- lockmsg is itself a RegEx object, parsed from a delimited literal with the `i` flag.
-- It is the value stored in the __metatable field of the metatables built in this file.
lockmsg = re.fromstring([[/The\s*metatable\s*is\s*(?:locked|inaccessible)(?#Nice try :])/i]]);
|
|
-- Set lockmsg's own __metatable field explicitly.
-- NOTE(review): presumably needed because the `lockmsg` variable was still unset while
-- new_re built this object - confirm against its declaration.
getmetatable(lockmsg).__metatable = lockmsg;
|
|
|
|
-- __newindex handler for the read-only tables below: every write attempt raises,
-- with the error attributed to the code that performed the assignment (level 2).
local function readonly_table()
	local message = "Attempt to modify a readonly table";
	error(message, 2);
end;
|
|
|
|
-- Turn match_m into a locked, read-only metatable whose lookups resolve
-- through the previous match_m table.
match_m = {
	__newindex = readonly_table,
	__metatable = lockmsg,
	__index = match_m,
};

-- re.Match: an empty table backed by match_m.
re.Match = setmetatable({ }, match_m);

-- The module value is an empty front table: reads are forwarded to `re`,
-- writes raise, and getmetatable() yields lockmsg.
return setmetatable({ }, {
	__newindex = readonly_table,
	__metatable = lockmsg,
	__index = re,
});
|