2021-10-29 21:25:12 +01:00
|
|
|
// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
|
|
|
|
// This code is based on Lua 5.x implementation licensed under MIT License; see lua_LICENSE.txt for details
|
|
|
|
#include "lualib.h"
|
|
|
|
|
|
|
|
#include "lcommon.h"
|
|
|
|
|
|
|
|
// upper bound on code points: utf8_decode rejects anything above it and
// buffutfchar reports "value out of range" for arguments above it
#define MAXUNICODE 0x10FFFF
|
|
|
|
|
|
|
|
// true when the byte pointed to by 'p' has its top two bits equal to 10,
// i.e. it is a UTF-8 continuation byte rather than the start of a sequence
#define iscont(p) ((*(p)&0xC0) == 0x80)
|
|
|
|
|
2022-08-04 23:35:33 +01:00
|
|
|
// from strlib
// Translate a possibly-relative string position into an absolute one.
// A non-negative 'pos' is returned unchanged. A negative 'pos' counts back
// from the end of a string of length 'len' (-1 is the last byte); if it
// reaches before the start of the string the result is 0.
static int u_posrelat(int pos, size_t len)
{
    if (pos >= 0)
        return pos;

    // distance back from the end, computed in unsigned arithmetic because
    // negating the most negative int directly would overflow
    size_t back = 0u - (size_t)pos;

    return (back > len) ? 0 : (int)len + pos + 1;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
** Decode one UTF-8 sequence, returning NULL if byte sequence is invalid.
|
|
|
|
*/
|
|
|
|
static const char* utf8_decode(const char* o, int* val)
|
|
|
|
{
|
|
|
|
static const unsigned int limits[] = {0xFF, 0x7F, 0x7FF, 0xFFFF};
|
|
|
|
const unsigned char* s = (const unsigned char*)o;
|
|
|
|
unsigned int c = s[0];
|
2022-08-04 23:35:33 +01:00
|
|
|
unsigned int res = 0; // final result
|
|
|
|
if (c < 0x80) // ascii?
|
2021-10-29 21:25:12 +01:00
|
|
|
res = c;
|
|
|
|
else
|
|
|
|
{
|
2022-08-04 23:35:33 +01:00
|
|
|
int count = 0; // to count number of continuation bytes
|
2021-10-29 21:25:12 +01:00
|
|
|
while (c & 0x40)
|
2022-08-04 23:35:33 +01:00
|
|
|
{ // still have continuation bytes?
|
|
|
|
int cc = s[++count]; // read next byte
|
|
|
|
if ((cc & 0xC0) != 0x80) // not a continuation byte?
|
|
|
|
return NULL; // invalid byte sequence
|
|
|
|
res = (res << 6) | (cc & 0x3F); // add lower 6 bits from cont. byte
|
|
|
|
c <<= 1; // to test next bit
|
2021-10-29 21:25:12 +01:00
|
|
|
}
|
2022-08-04 23:35:33 +01:00
|
|
|
res |= ((c & 0x7F) << (count * 5)); // add first byte
|
2021-10-29 21:25:12 +01:00
|
|
|
if (count > 3 || res > MAXUNICODE || res <= limits[count])
|
2022-08-04 23:35:33 +01:00
|
|
|
return NULL; // invalid byte sequence
|
|
|
|
s += count; // skip continuation bytes read
|
2021-10-29 21:25:12 +01:00
|
|
|
}
|
|
|
|
if (val)
|
|
|
|
*val = res;
|
2022-08-04 23:35:33 +01:00
|
|
|
return (const char*)s + 1; // +1 to include first byte
|
2021-10-29 21:25:12 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
** utf8len(s [, i [, j]]) --> number of characters that start in the
|
|
|
|
** range [i,j], or nil + current position if 's' is not well formed in
|
|
|
|
** that interval
|
|
|
|
*/
|
|
|
|
static int utflen(lua_State* L)
|
|
|
|
{
|
|
|
|
int n = 0;
|
|
|
|
size_t len;
|
|
|
|
const char* s = luaL_checklstring(L, 1, &len);
|
|
|
|
int posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
|
|
|
|
int posj = u_posrelat(luaL_optinteger(L, 3, -1), len);
|
|
|
|
luaL_argcheck(L, 1 <= posi && --posi <= (int)len, 2, "initial position out of string");
|
|
|
|
luaL_argcheck(L, --posj < (int)len, 3, "final position out of string");
|
|
|
|
while (posi <= posj)
|
|
|
|
{
|
|
|
|
const char* s1 = utf8_decode(s + posi, NULL);
|
|
|
|
if (s1 == NULL)
|
2022-08-04 23:35:33 +01:00
|
|
|
{ // conversion error?
|
|
|
|
lua_pushnil(L); // return nil ...
|
|
|
|
lua_pushinteger(L, posi + 1); // ... and current position
|
2021-10-29 21:25:12 +01:00
|
|
|
return 2;
|
|
|
|
}
|
|
|
|
posi = (int)(s1 - s);
|
|
|
|
n++;
|
|
|
|
}
|
|
|
|
lua_pushinteger(L, n);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
** codepoint(s, [i, [j]]) -> returns codepoints for all characters
|
|
|
|
** that start in the range [i,j]
|
|
|
|
*/
|
|
|
|
static int codepoint(lua_State* L)
|
|
|
|
{
|
|
|
|
size_t len;
|
|
|
|
const char* s = luaL_checklstring(L, 1, &len);
|
|
|
|
int posi = u_posrelat(luaL_optinteger(L, 2, 1), len);
|
|
|
|
int pose = u_posrelat(luaL_optinteger(L, 3, posi), len);
|
|
|
|
int n;
|
|
|
|
const char* se;
|
|
|
|
luaL_argcheck(L, posi >= 1, 2, "out of range");
|
|
|
|
luaL_argcheck(L, pose <= (int)len, 3, "out of range");
|
|
|
|
if (posi > pose)
|
2022-08-04 23:35:33 +01:00
|
|
|
return 0; // empty interval; return no values
|
|
|
|
if (pose - posi >= INT_MAX) // (int -> int) overflow?
|
2021-10-29 21:25:12 +01:00
|
|
|
luaL_error(L, "string slice too long");
|
|
|
|
n = (int)(pose - posi) + 1;
|
|
|
|
luaL_checkstack(L, n, "string slice too long");
|
|
|
|
n = 0;
|
|
|
|
se = s + pose;
|
|
|
|
for (s += posi - 1; s < se;)
|
|
|
|
{
|
|
|
|
int code;
|
|
|
|
s = utf8_decode(s, &code);
|
|
|
|
if (s == NULL)
|
|
|
|
luaL_error(L, "invalid UTF-8 code");
|
|
|
|
lua_pushinteger(L, code);
|
|
|
|
n++;
|
|
|
|
}
|
|
|
|
return n;
|
|
|
|
}
|
|
|
|
|
|
|
|
// from Lua 5.3 lobject.h
|
|
|
|
// size in bytes of the scratch buffer that luaO_utf8esc fills backwards from
// its last element
#define UTF8BUFFSZ 8
|
|
|
|
|
|
|
|
// from Lua 5.3 lobject.c, copied verbatim + static
|
|
|
|
static int luaO_utf8esc(char* buff, unsigned long x)
|
|
|
|
{
|
2022-08-04 23:35:33 +01:00
|
|
|
int n = 1; // number of bytes put in buffer (backwards)
|
2021-10-29 21:25:12 +01:00
|
|
|
LUAU_ASSERT(x <= 0x10FFFF);
|
2022-08-04 23:35:33 +01:00
|
|
|
if (x < 0x80) // ascii?
|
2021-10-29 21:25:12 +01:00
|
|
|
buff[UTF8BUFFSZ - 1] = cast_to(char, x);
|
|
|
|
else
|
2022-08-04 23:35:33 +01:00
|
|
|
{ // need continuation bytes
|
|
|
|
unsigned int mfb = 0x3f; // maximum that fits in first byte
|
2021-10-29 21:25:12 +01:00
|
|
|
do
|
2022-08-04 23:35:33 +01:00
|
|
|
{ // add continuation bytes
|
2021-10-29 21:25:12 +01:00
|
|
|
buff[UTF8BUFFSZ - (n++)] = cast_to(char, 0x80 | (x & 0x3f));
|
2022-08-04 23:35:33 +01:00
|
|
|
x >>= 6; // remove added bits
|
|
|
|
mfb >>= 1; // now there is one less bit available in first byte
|
|
|
|
} while (x > mfb); // still needs continuation byte?
|
|
|
|
buff[UTF8BUFFSZ - n] = cast_to(char, (~mfb << 1) | x); // add first byte
|
2021-10-29 21:25:12 +01:00
|
|
|
}
|
|
|
|
return n;
|
|
|
|
}
|
|
|
|
|
|
|
|
// lighter replacement for pushutfchar; doesn't push any string onto the stack
|
|
|
|
static int buffutfchar(lua_State* L, int arg, char* buff, const char** charstr)
|
|
|
|
{
|
|
|
|
int code = luaL_checkinteger(L, arg);
|
|
|
|
luaL_argcheck(L, 0 <= code && code <= MAXUNICODE, arg, "value out of range");
|
|
|
|
int l = luaO_utf8esc(buff, cast_to(long, code));
|
|
|
|
*charstr = buff + UTF8BUFFSZ - l;
|
|
|
|
return l;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
** utfchar(n1, n2, ...) -> char(n1)..char(n2)...
|
|
|
|
**
|
|
|
|
** This version avoids the need to make more invasive upgrades elsewhere (like
|
|
|
|
** implementing the %U escape in lua_pushfstring) and avoids pushing string
|
|
|
|
** objects for each codepoint in the multi-argument case. -Jovanni
|
|
|
|
*/
|
|
|
|
static int utfchar(lua_State* L)
|
|
|
|
{
|
|
|
|
char buff[UTF8BUFFSZ];
|
|
|
|
const char* charstr;
|
|
|
|
|
2022-08-04 23:35:33 +01:00
|
|
|
int n = lua_gettop(L); // number of arguments
|
2021-10-29 21:25:12 +01:00
|
|
|
if (n == 1)
|
2022-08-04 23:35:33 +01:00
|
|
|
{ // optimize common case of single char
|
2021-10-29 21:25:12 +01:00
|
|
|
int l = buffutfchar(L, 1, buff, &charstr);
|
|
|
|
lua_pushlstring(L, charstr, l);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
luaL_Buffer b;
|
|
|
|
luaL_buffinit(L, &b);
|
|
|
|
for (int i = 1; i <= n; i++)
|
|
|
|
{
|
|
|
|
int l = buffutfchar(L, i, buff, &charstr);
|
|
|
|
luaL_addlstring(&b, charstr, l);
|
|
|
|
}
|
|
|
|
luaL_pushresult(&b);
|
|
|
|
}
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
** offset(s, n, [i]) -> index where n-th character counting from
|
|
|
|
** position 'i' starts; 0 means character at 'i'.
|
|
|
|
*/
|
|
|
|
static int byteoffset(lua_State* L)
|
|
|
|
{
|
|
|
|
size_t len;
|
|
|
|
const char* s = luaL_checklstring(L, 1, &len);
|
|
|
|
int n = luaL_checkinteger(L, 2);
|
|
|
|
int posi = (n >= 0) ? 1 : (int)len + 1;
|
|
|
|
posi = u_posrelat(luaL_optinteger(L, 3, posi), len);
|
|
|
|
luaL_argcheck(L, 1 <= posi && --posi <= (int)len, 3, "position out of range");
|
|
|
|
if (n == 0)
|
|
|
|
{
|
2022-08-04 23:35:33 +01:00
|
|
|
// find beginning of current byte sequence
|
2021-10-29 21:25:12 +01:00
|
|
|
while (posi > 0 && iscont(s + posi))
|
|
|
|
posi--;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (iscont(s + posi))
|
|
|
|
luaL_error(L, "initial position is a continuation byte");
|
|
|
|
if (n < 0)
|
|
|
|
{
|
|
|
|
while (n < 0 && posi > 0)
|
2022-08-04 23:35:33 +01:00
|
|
|
{ // move back
|
2021-10-29 21:25:12 +01:00
|
|
|
do
|
2022-08-04 23:35:33 +01:00
|
|
|
{ // find beginning of previous character
|
2021-10-29 21:25:12 +01:00
|
|
|
posi--;
|
|
|
|
} while (posi > 0 && iscont(s + posi));
|
|
|
|
n++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2022-08-04 23:35:33 +01:00
|
|
|
n--; // do not move for 1st character
|
2021-10-29 21:25:12 +01:00
|
|
|
while (n > 0 && posi < (int)len)
|
|
|
|
{
|
|
|
|
do
|
2022-08-04 23:35:33 +01:00
|
|
|
{ // find beginning of next character
|
2021-10-29 21:25:12 +01:00
|
|
|
posi++;
|
2022-08-04 23:35:33 +01:00
|
|
|
} while (iscont(s + posi)); // (cannot pass final '\0')
|
2021-10-29 21:25:12 +01:00
|
|
|
n--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2022-08-04 23:35:33 +01:00
|
|
|
if (n == 0) // did it find given character?
|
2021-10-29 21:25:12 +01:00
|
|
|
lua_pushinteger(L, posi + 1);
|
2022-08-04 23:35:33 +01:00
|
|
|
else // no such character
|
2021-10-29 21:25:12 +01:00
|
|
|
lua_pushnil(L);
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
** Step function of the "codes" iterator. Argument 1 is the string and
** argument 2 the 1-based byte position of the character returned by the
** previous step (iter_codes starts the iteration with 0).
** Pushes the position and code point of the next character, or returns no
** values once the end of the string is reached. Raises "invalid UTF-8 code"
** when the next sequence cannot be decoded or is followed by a stray
** continuation byte.
*/
static int iter_aux(lua_State* L)
{
    size_t len;
    const char* s = luaL_checklstring(L, 1, &len);
    int prev = lua_tointeger(L, 2);
    int n; // 0-based offset of the character to decode

    // Test the sign before subtracting 1: the control value can be supplied by
    // a direct call to this function, and 'prev - 1' would overflow for the
    // most negative int.
    if (prev <= 0) // first iteration?
        n = 0;     // start from here
    else
    {
        n = prev - 1;
        if (n < (int)len)
        {
            n++; // skip current byte
            while (iscont(s + n))
                n++; // and its continuations
        }
    }

    if (n >= (int)len)
        return 0; // no more codepoints

    int code;
    const char* next = utf8_decode(s + n, &code);
    if (next == NULL || iscont(next))
        luaL_error(L, "invalid UTF-8 code");

    lua_pushinteger(L, n + 1);
    lua_pushinteger(L, code);
    return 2;
}
|
|
|
|
|
|
|
|
/*
** codes(s) -> iterator triple for a generic 'for' loop: the step function
** iter_aux, the string itself as the invariant state, and 0 as the initial
** control value.
*/
static int iter_codes(lua_State* L)
{
    luaL_checkstring(L, 1); // argument 1 must be a string

    lua_pushcfunction(L, iter_aux, NULL); // step function
    lua_pushvalue(L, 1);                  // state: the string being traversed
    lua_pushinteger(L, 0);                // control: nothing returned yet

    return 3;
}
|
|
|
|
|
2022-08-04 23:35:33 +01:00
|
|
|
// pattern to match a single UTF-8 character
|
2021-10-29 21:25:12 +01:00
|
|
|
// the literal contains an embedded '\0', so luaopen_utf8 pushes it with an
// explicit length taken from sizeof instead of relying on NUL termination
#define UTF8PATT "[\0-\x7F\xC2-\xF4][\x80-\xBF]*"
|
|
|
|
|
|
|
|
static const luaL_Reg funcs[] = {
|
|
|
|
{"offset", byteoffset},
|
|
|
|
{"codepoint", codepoint},
|
|
|
|
{"char", utfchar},
|
|
|
|
{"len", utflen},
|
|
|
|
{"codes", iter_codes},
|
|
|
|
{NULL, NULL},
|
|
|
|
};
|
|
|
|
|
2021-11-29 16:14:06 +00:00
|
|
|
// Registers the functions in 'funcs' under LUA_UTF8LIBNAME, adds the
// "charpattern" string field to the library table and leaves that table on
// the stack as the single result.
int luaopen_utf8(lua_State* L)
{
    luaL_register(L, LUA_UTF8LIBNAME, funcs);

    // UTF8PATT has an embedded '\0': take its length from sizeof (dropping
    // the terminating NUL) so the whole pattern is pushed
    lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT) - 1);
    lua_setfield(L, -2, "charpattern");

    return 1;
}
|