luau/VM/src/lutf8lib.cpp

// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
// This code is based on Lua 5.x implementation licensed under MIT License; see lua_LICENSE.txt for details
#include "lualib.h"

#include "lcommon.h"

#define MAXUNICODE 0x10FFFF

#define iscont(p) ((*(p)&0xC0) == 0x80)

/* from strlib */
/* translate a relative string position: negative means back from end */
static int u_posrelat(int pos, size_t len)
{
    if (pos >= 0)
        return pos;
    else if (0u - (size_t)pos > len)
        return 0;
    else
        return (int)len + pos + 1;
}

/*
** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
*/
static const char* utf8_decode(const char* o, int* val)
{
    static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
    const unsigned char* s = (const unsigned char*)o;
    unsigned int c = s[0];
    unsigned int res = 0; /* final result */
    if (c < 0x80)         /* ascii? */
        res = c;
    else
    {
        int count = 0; /* to count number of continuation bytes */
        while (c & 0x40)
        {                                   /* still have continuation bytes? */
            int cc = s[++count];            /* read next byte */
            if ((cc & 0xC0) != 0x80)        /* not a continuation byte? */
                return NULL;                /* invalid byte sequence */
            res = (res << 6) | (cc & 0x3F); /* add lower 6 bits from cont. byte */
            c <<= 1;                        /* to test next bit */
        }
        res |= ((c & 0x7F) << (count * 5)); /* add first byte */
        if (count > 3 || res > MAXUNICODE || res <= limits[count])
            return NULL; /* invalid byte sequence */
        s += count;      /* skip continuation bytes read */
    }
    if (val)
        *val = res;
    return (const char*)s + 1; /* +1 to include first byte */
}

/*
** utf8len(s [, i [, j]]) --> number of characters that start in the
** range [i,j], or nil + current position if 's' is not well formed in
** that interval
*/
static int utflen(lua_State* L)
{
    int n = 0;
    size_t len;
    const char* s = luaL_checklstring(L, 1, &len);
    int posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
    int posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
    luaL_argcheck(L, 1 <= posi && --posi <= (int)len, 2, "initial position out of string");
    luaL_argcheck(L, --posj < (int)len, 3, "final position out of string");
    while (posi <= posj)
    {
        const char* s1 = utf8_decode(s + posi, NULL);
        if (s1 == NULL)
        {                                 /* conversion error? */
            lua_pushnil(L);               /* return nil ... */
            lua_pushinteger(L, posi + 1); /* ... and current position */
            return 2;
        }
        posi = (int)(s1 - s);
        n++;
    }
    lua_pushinteger(L, n);
    return 1;
}

/*
** codepoint(s, [i, [j]])  -> returns codepoints for all characters
** that start in the range [i,j]
*/
static int codepoint(lua_State* L)
{
    size_t len;
    const char* s = luaL_checklstring(L, 1, &len);
    int posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
    int pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
    int n;
    const char* se;
    luaL_argcheck(L, posi >= 1, 2, "out of range");
    luaL_argcheck(L, pose <= (int)len, 3, "out of range");
    if (posi > pose)
        return 0;               /* empty interval; return no values */
    if (pose - posi >= INT_MAX) /* (int -> int) overflow? */
        luaL_error(L, "string slice too long");
    n = (int)(pose - posi) + 1;
    luaL_checkstack(L, n, "string slice too long");
    n = 0;
    se = s + pose;
    for (s += posi - 1; s < se;)
    {
        int code;
        s = utf8_decode(s, &code);
        if (s == NULL)
            luaL_error(L, "invalid UTF-8 code");
        lua_pushinteger(L, code);
        n++;
    }
    return n;
}

// from Lua 5.3 lobject.h
#define UTF8BUFFSZ 8

// from Lua 5.3 lobject.c, copied verbatim + static
static int luaO_utf8esc(char* buff, unsigned long x)
{
    int n = 1; /* number of bytes put in buffer (backwards) */
    LUAU_ASSERT(x <= 0x10FFFF);
    if (x < 0x80) /* ascii? */
        buff[UTF8BUFFSZ - 1] = cast_to(char, x);
    else
    {                            /* need continuation bytes */
        unsigned int mfb = 0x3f; /* maximum that fits in first byte */
        do
        { /* add continuation bytes */
            buff[UTF8BUFFSZ - (n++)] = cast_to(char, 0x80 | (x & 0x3f));
            x >>= 6;                                           /* remove added bits */
            mfb >>= 1;                                         /* now there is one less bit available in first byte */
        } while (x > mfb);                                     /* still needs continuation byte? */
        buff[UTF8BUFFSZ - n] = cast_to(char, (~mfb << 1) | x); /* add first byte */
    }
    return n;
}

// lighter replacement for pushutfchar; doesn't push any string onto the stack
static int buffutfchar(lua_State* L, int arg, char* buff, const char** charstr)
{
    int code = luaL_checkinteger(L, arg);
    luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
    int l = luaO_utf8esc(buff, cast_to(long, code));
    *charstr = buff + UTF8BUFFSZ - l;
    return l;
}

/*
** utfchar(n1, n2, ...)  -> char(n1)..char(n2)...
**
** This version avoids the need to make more invasive upgrades elsewhere (like
** implementing the %U escape in lua_pushfstring) and avoids pushing string
** objects for each codepoint in the multi-argument case. -Jovanni
*/
static int utfchar(lua_State* L)
{
    char buff[UTF8BUFFSZ];
    const char* charstr;

    int n = lua_gettop(L); /* number of arguments */
    if (n == 1)
    { /* optimize common case of single char */
        int l = buffutfchar(L, 1, buff, &charstr);
        lua_pushlstring(L, charstr, l);
    }
    else
    {
        luaL_Buffer b;
        luaL_buffinit(L, &b);
        for (int i = 1; i <= n; i++)
        {
            int l = buffutfchar(L, i, buff, &charstr);
            luaL_addlstring(&b, charstr, l);
        }
        luaL_pushresult(&b);
    }
    return 1;
}

/*
** offset(s, n, [i])  -> index where n-th character counting from
**   position 'i' starts; 0 means character at 'i'.
*/
static int byteoffset(lua_State* L)
{
    size_t len;
    const char* s = luaL_checklstring(L, 1, &len);
    int n = luaL_checkinteger(L, 2);
    int posi = (n >= 0) ? 1 : (int)len + 1;
    posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
    luaL_argcheck(L, 1 <= posi && --posi <= (int)len, 3, "position out of range");
    if (n == 0)
    {
        /* find beginning of current byte sequence */
        while (posi > 0 && iscont(s + posi))
            posi--;
    }
    else
    {
        if (iscont(s + posi))
            luaL_error(L, "initial position is a continuation byte");
        if (n < 0)
        {
            while (n < 0 && posi > 0)
            { /* move back */
                do
                { /* find beginning of previous character */
                    posi--;
                } while (posi > 0 && iscont(s + posi));
                n++;
            }
        }
        else
        {
            n--; /* do not move for 1st character */
            while (n > 0 && posi < (int)len)
            {
                do
                { /* find beginning of next character */
                    posi++;
                } while (iscont(s + posi)); /* (cannot pass final '\0') */
                n--;
            }
        }
    }
    if (n == 0) /* did it find given character? */
        lua_pushinteger(L, posi + 1);
    else /* no such character */
        lua_pushnil(L);
    return 1;
}

static int iter_aux(lua_State* L)
{
    size_t len;
    const char* s = luaL_checklstring(L, 1, &len);
    int n = lua_tointeger(L, 2) - 1;
    if (n < 0) /* first iteration? */
        n = 0; /* start from here */
    else if (n < (int)len)
    {
        n++; /* skip current byte */
        while (iscont(s + n))
            n++; /* and its continuations */
    }
    if (n >= (int)len)
        return 0; /* no more codepoints */
    else
    {
        int code;
        const char* next = utf8_decode(s + n, &code);
        if (next == NULL || iscont(next))
            luaL_error(L, "invalid UTF-8 code");
        lua_pushinteger(L, n + 1);
        lua_pushinteger(L, code);
        return 2;
    }
}

static int iter_codes(lua_State* L)
{
    luaL_checkstring(L, 1);
    lua_pushcfunction(L, iter_aux, NULL);
    lua_pushvalue(L, 1);
    lua_pushinteger(L, 0);
    return 3;
}

/* pattern to match a single UTF-8 character */
#define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"

static const luaL_Reg funcs[] = {
    {"offset", byteoffset},
    {"codepoint", codepoint},
    {"char", utfchar},
    {"len", utflen},
    {"codes", iter_codes},
    {NULL, NULL},
};

LUALIB_API int luaopen_utf8(lua_State* L)
{
    luaL_register(L, LUA_UTF8LIBNAME, funcs);

    lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT) / sizeof(char) - 1);
    lua_setfield(L, -2, "charpattern");

    return 1;
}
Sync to upstream/release/501 (#20) Co-authored-by: Rodactor <rodactor@roblox.com> 2021-10-29 21:25:12 +01:00			`// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details`
			`// This code is based on Lua 5.x implementation licensed under MIT License; see lua_LICENSE.txt for details`
			`#include "lualib.h"`

			`#include "lcommon.h"`

			`#define MAXUNICODE 0x10FFFF`

			`#define iscont(p) ((*(p)&0xC0) == 0x80)`

			`/* from strlib */`
			`/* translate a relative string position: negative means back from end */`
			`static int u_posrelat(int pos, size_t len)`
			`{`
			`if (pos >= 0)`
			`return pos;`
			`else if (0u - (size_t)pos > len)`
			`return 0;`
			`else`
			`return (int)len + pos + 1;`
			`}`

			`/*`
			`** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.`
			`*/`
			`static const char* utf8_decode(const char* o, int* val)`
			`{`
			`static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};`
			`const unsigned char* s = (const unsigned char*)o;`
			`unsigned int c = s[0];`
			`unsigned int res = 0; /* final result */`
			`if (c < 0x80) /* ascii? */`
			`res = c;`
			`else`
			`{`
			`int count = 0; /* to count number of continuation bytes */`
			`while (c & 0x40)`
			`{ /* still have continuation bytes? */`
			`int cc = s[++count]; /* read next byte */`
			`if ((cc & 0xC0) != 0x80) /* not a continuation byte? */`
			`return NULL; /* invalid byte sequence */`
			`res = (res << 6) \| (cc & 0x3F); /* add lower 6 bits from cont. byte */`
			`c <<= 1; /* to test next bit */`
			`}`
			`res \|= ((c & 0x7F) << (count * 5)); /* add first byte */`
			`if (count > 3 \|\| res > MAXUNICODE \|\| res <= limits[count])`
			`return NULL; /* invalid byte sequence */`
			`s += count; /* skip continuation bytes read */`
			`}`
			`if (val)`
			`*val = res;`
			`return (const char)s + 1; / +1 to include first byte */`
			`}`

			`/*`
			`** utf8len(s [, i [, j]]) --> number of characters that start in the`
			`** range [i,j], or nil + current position if 's' is not well formed in`
			`** that interval`
			`*/`
			`static int utflen(lua_State* L)`
			`{`
			`int n = 0;`
			`size_t len;`
			`const char* s = luaL_checklstring(L, 1, &len);`
			`int posi = u_posrelat(luaL_optinteger(L, 2, 1), len);`
			`int posj = u_posrelat(luaL_optinteger(L, 3, -1), len);`
			`luaL_argcheck(L, 1 <= posi && --posi <= (int)len, 2, "initial position out of string");`
			`luaL_argcheck(L, --posj < (int)len, 3, "final position out of string");`
			`while (posi <= posj)`
			`{`
			`const char* s1 = utf8_decode(s + posi, NULL);`
			`if (s1 == NULL)`
			`{ /* conversion error? */`
			`lua_pushnil(L); /* return nil ... */`
			`lua_pushinteger(L, posi + 1); /* ... and current position */`
			`return 2;`
			`}`
			`posi = (int)(s1 - s);`
			`n++;`
			`}`
			`lua_pushinteger(L, n);`
			`return 1;`
			`}`

			`/*`
			`** codepoint(s, [i, [j]]) -> returns codepoints for all characters`
			`** that start in the range [i,j]`
			`*/`
			`static int codepoint(lua_State* L)`
			`{`
			`size_t len;`
			`const char* s = luaL_checklstring(L, 1, &len);`
			`int posi = u_posrelat(luaL_optinteger(L, 2, 1), len);`
			`int pose = u_posrelat(luaL_optinteger(L, 3, posi), len);`
			`int n;`
			`const char* se;`
			`luaL_argcheck(L, posi >= 1, 2, "out of range");`
			`luaL_argcheck(L, pose <= (int)len, 3, "out of range");`
			`if (posi > pose)`
			`return 0; /* empty interval; return no values */`
			`if (pose - posi >= INT_MAX) /* (int -> int) overflow? */`
			`luaL_error(L, "string slice too long");`
			`n = (int)(pose - posi) + 1;`
			`luaL_checkstack(L, n, "string slice too long");`
			`n = 0;`
			`se = s + pose;`
			`for (s += posi - 1; s < se;)`
			`{`
			`int code;`
			`s = utf8_decode(s, &code);`
			`if (s == NULL)`
			`luaL_error(L, "invalid UTF-8 code");`
			`lua_pushinteger(L, code);`
			`n++;`
			`}`
			`return n;`
			`}`

			`// from Lua 5.3 lobject.h`
			`#define UTF8BUFFSZ 8`

			`// from Lua 5.3 lobject.c, copied verbatim + static`
			`static int luaO_utf8esc(char* buff, unsigned long x)`
			`{`
			`int n = 1; /* number of bytes put in buffer (backwards) */`
			`LUAU_ASSERT(x <= 0x10FFFF);`
			`if (x < 0x80) /* ascii? */`
			`buff[UTF8BUFFSZ - 1] = cast_to(char, x);`
			`else`
			`{ /* need continuation bytes */`
			`unsigned int mfb = 0x3f; /* maximum that fits in first byte */`
			`do`
			`{ /* add continuation bytes */`
			`buff[UTF8BUFFSZ - (n++)] = cast_to(char, 0x80 \| (x & 0x3f));`
			`x >>= 6; /* remove added bits */`
			`mfb >>= 1; /* now there is one less bit available in first byte */`
			`} while (x > mfb); /* still needs continuation byte? */`
			`buff[UTF8BUFFSZ - n] = cast_to(char, (~mfb << 1) \| x); /* add first byte */`
			`}`
			`return n;`
			`}`

			`// lighter replacement for pushutfchar; doesn't push any string onto the stack`
			`static int buffutfchar(lua_State* L, int arg, char* buff, const char** charstr)`
			`{`
			`int code = luaL_checkinteger(L, arg);`
			`luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");`
			`int l = luaO_utf8esc(buff, cast_to(long, code));`
			`*charstr = buff + UTF8BUFFSZ - l;`
			`return l;`
			`}`

			`/*`
			`** utfchar(n1, n2, ...) -> char(n1)..char(n2)...`
			`**`
			`** This version avoids the need to make more invasive upgrades elsewhere (like`
			`** implementing the %U escape in lua_pushfstring) and avoids pushing string`
			`** objects for each codepoint in the multi-argument case. -Jovanni`
			`*/`
			`static int utfchar(lua_State* L)`
			`{`
			`char buff[UTF8BUFFSZ];`
			`const char* charstr;`

			`int n = lua_gettop(L); /* number of arguments */`
			`if (n == 1)`
			`{ /* optimize common case of single char */`
			`int l = buffutfchar(L, 1, buff, &charstr);`
			`lua_pushlstring(L, charstr, l);`
			`}`
			`else`
			`{`
			`luaL_Buffer b;`
			`luaL_buffinit(L, &b);`
			`for (int i = 1; i <= n; i++)`
			`{`
			`int l = buffutfchar(L, i, buff, &charstr);`
			`luaL_addlstring(&b, charstr, l);`
			`}`
			`luaL_pushresult(&b);`
			`}`
			`return 1;`
			`}`

			`/*`
			`** offset(s, n, [i]) -> index where n-th character counting from`
			`** position 'i' starts; 0 means character at 'i'.`
			`*/`
			`static int byteoffset(lua_State* L)`
			`{`
			`size_t len;`
			`const char* s = luaL_checklstring(L, 1, &len);`
			`int n = luaL_checkinteger(L, 2);`
			`int posi = (n >= 0) ? 1 : (int)len + 1;`
			`posi = u_posrelat(luaL_optinteger(L, 3, posi), len);`
			`luaL_argcheck(L, 1 <= posi && --posi <= (int)len, 3, "position out of range");`
			`if (n == 0)`
			`{`
			`/* find beginning of current byte sequence */`
			`while (posi > 0 && iscont(s + posi))`
			`posi--;`
			`}`
			`else`
			`{`
			`if (iscont(s + posi))`
			`luaL_error(L, "initial position is a continuation byte");`
			`if (n < 0)`
			`{`
			`while (n < 0 && posi > 0)`
			`{ /* move back */`
			`do`
			`{ /* find beginning of previous character */`
			`posi--;`
			`} while (posi > 0 && iscont(s + posi));`
			`n++;`
			`}`
			`}`
			`else`
			`{`
			`n--; /* do not move for 1st character */`
			`while (n > 0 && posi < (int)len)`
			`{`
			`do`
			`{ /* find beginning of next character */`
			`posi++;`
			`} while (iscont(s + posi)); /* (cannot pass final '\0') */`
			`n--;`
			`}`
			`}`
			`}`
			`if (n == 0) /* did it find given character? */`
			`lua_pushinteger(L, posi + 1);`
			`else /* no such character */`
			`lua_pushnil(L);`
			`return 1;`
			`}`

			`static int iter_aux(lua_State* L)`
			`{`
			`size_t len;`
			`const char* s = luaL_checklstring(L, 1, &len);`
			`int n = lua_tointeger(L, 2) - 1;`
			`if (n < 0) /* first iteration? */`
			`n = 0; /* start from here */`
			`else if (n < (int)len)`
			`{`
			`n++; /* skip current byte */`
			`while (iscont(s + n))`
			`n++; /* and its continuations */`
			`}`
			`if (n >= (int)len)`
			`return 0; /* no more codepoints */`
			`else`
			`{`
			`int code;`
			`const char* next = utf8_decode(s + n, &code);`
			`if (next == NULL \|\| iscont(next))`
			`luaL_error(L, "invalid UTF-8 code");`
			`lua_pushinteger(L, n + 1);`
			`lua_pushinteger(L, code);`
			`return 2;`
			`}`
			`}`

			`static int iter_codes(lua_State* L)`
			`{`
			`luaL_checkstring(L, 1);`
Sync to upstream/release/505 2021-11-18 22:21:07 +00:00			`lua_pushcfunction(L, iter_aux, NULL);`
Sync to upstream/release/501 (#20) Co-authored-by: Rodactor <rodactor@roblox.com> 2021-10-29 21:25:12 +01:00			`lua_pushvalue(L, 1);`
			`lua_pushinteger(L, 0);`
			`return 3;`
			`}`

			`/* pattern to match a single UTF-8 character */`
			`#define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"`

			`static const luaL_Reg funcs[] = {`
			`{"offset", byteoffset},`
			`{"codepoint", codepoint},`
			`{"char", utfchar},`
			`{"len", utflen},`
			`{"codes", iter_codes},`
			`{NULL, NULL},`
			`};`

			`LUALIB_API int luaopen_utf8(lua_State* L)`
			`{`
			`luaL_register(L, LUA_UTF8LIBNAME, funcs);`

			`lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT) / sizeof(char) - 1);`
			`lua_setfield(L, -2, "charpattern");`

			`return 1;`
			`}`