luau/Ast/src/Lexer.cpp
2022-02-17 17:18:01 -08:00

1139 lines
25 KiB
C++

// This file is part of the Luau programming language and is licensed under MIT License; see LICENSE.txt for details
#include "Luau/Lexer.h"
#include "Luau/Confusables.h"
#include "Luau/StringUtils.h"
#include <limits.h>
namespace Luau
{
Allocator::Allocator()
: root(static_cast<Page*>(operator new(sizeof(Page))))
, offset(0)
{
root->next = nullptr;
}
Allocator::Allocator(Allocator&& rhs)
: root(rhs.root)
, offset(rhs.offset)
{
rhs.root = nullptr;
rhs.offset = 0;
}
Allocator::~Allocator()
{
Page* page = root;
while (page)
{
Page* next = page->next;
operator delete(page);
page = next;
}
}
void* Allocator::allocate(size_t size)
{
constexpr size_t align = alignof(void*) > alignof(double) ? alignof(void*) : alignof(double);
if (root)
{
uintptr_t data = reinterpret_cast<uintptr_t>(root->data);
uintptr_t result = (data + offset + align - 1) & ~(align - 1);
if (result + size <= data + sizeof(root->data))
{
offset = result - data + size;
return reinterpret_cast<void*>(result);
}
}
// allocate new page
size_t pageSize = size > sizeof(root->data) ? size : sizeof(root->data);
void* pageData = operator new(offsetof(Page, data) + pageSize);
Page* page = static_cast<Page*>(pageData);
page->next = root;
root = page;
offset = size;
return page->data;
}
Lexeme::Lexeme(const Location& location, Type type)
: type(type)
, location(location)
, length(0)
, data(nullptr)
{
}
Lexeme::Lexeme(const Location& location, char character)
: type(static_cast<Type>(static_cast<unsigned char>(character)))
, location(location)
, length(0)
, data(nullptr)
{
}
Lexeme::Lexeme(const Location& location, Type type, const char* data, size_t size)
: type(type)
, location(location)
, length(unsigned(size))
, data(data)
{
LUAU_ASSERT(type == RawString || type == QuotedString || type == Number || type == Comment || type == BlockComment);
}
Lexeme::Lexeme(const Location& location, Type type, const char* name)
: type(type)
, location(location)
, length(0)
, name(name)
{
LUAU_ASSERT(type == Name || (type >= Reserved_BEGIN && type < Lexeme::Reserved_END));
}
static const char* kReserved[] = {"and", "break", "do", "else", "elseif", "end", "false", "for", "function", "if", "in", "local", "nil", "not", "or",
"repeat", "return", "then", "true", "until", "while"};
std::string Lexeme::toString() const
{
switch (type)
{
case Eof:
return "<eof>";
case Equal:
return "'=='";
case LessEqual:
return "'<='";
case GreaterEqual:
return "'>='";
case NotEqual:
return "'~='";
case Dot2:
return "'..'";
case Dot3:
return "'...'";
case SkinnyArrow:
return "'->'";
case DoubleColon:
return "'::'";
case AddAssign:
return "'+='";
case SubAssign:
return "'-='";
case MulAssign:
return "'*='";
case DivAssign:
return "'/='";
case ModAssign:
return "'%='";
case PowAssign:
return "'^='";
case ConcatAssign:
return "'..='";
case RawString:
case QuotedString:
return data ? format("\"%.*s\"", length, data) : "string";
case Number:
return data ? format("'%.*s'", length, data) : "number";
case Name:
return name ? format("'%s'", name) : "identifier";
case Comment:
return "comment";
case BrokenString:
return "malformed string";
case BrokenComment:
return "unfinished comment";
case BrokenUnicode:
if (codepoint)
{
if (const char* confusable = findConfusable(codepoint))
return format("Unicode character U+%x (did you mean '%s'?)", codepoint, confusable);
return format("Unicode character U+%x", codepoint);
}
else
{
return "invalid UTF-8 sequence";
}
default:
if (type < Char_END)
return format("'%c'", type);
else if (type >= Reserved_BEGIN && type < Reserved_END)
return format("'%s'", kReserved[type - Reserved_BEGIN]);
else
return "<unknown>";
}
}
bool AstNameTable::Entry::operator==(const Entry& other) const
{
return length == other.length && memcmp(value.value, other.value.value, length) == 0;
}
size_t AstNameTable::EntryHash::operator()(const Entry& e) const
{
// FNV1a
uint32_t hash = 2166136261;
for (size_t i = 0; i < e.length; ++i)
{
hash ^= uint8_t(e.value.value[i]);
hash *= 16777619;
}
return hash;
}
AstNameTable::AstNameTable(Allocator& allocator)
: data({AstName(""), 0, Lexeme::Eof}, 128)
, allocator(allocator)
{
static_assert(sizeof(kReserved) / sizeof(kReserved[0]) == Lexeme::Reserved_END - Lexeme::Reserved_BEGIN);
for (int i = Lexeme::Reserved_BEGIN; i < Lexeme::Reserved_END; ++i)
addStatic(kReserved[i - Lexeme::Reserved_BEGIN], static_cast<Lexeme::Type>(i));
}
AstName AstNameTable::addStatic(const char* name, Lexeme::Type type)
{
AstNameTable::Entry entry = {AstName(name), uint32_t(strlen(name)), type};
LUAU_ASSERT(!data.contains(entry));
data.insert(entry);
return entry.value;
}
std::pair<AstName, Lexeme::Type> AstNameTable::getOrAddWithType(const char* name, size_t length)
{
AstNameTable::Entry key = {AstName(name), uint32_t(length), Lexeme::Eof};
const Entry& entry = data.insert(key);
// entry already was inserted
if (entry.type != Lexeme::Eof)
return std::make_pair(entry.value, entry.type);
// we just inserted an entry with a non-owned pointer into the map
// we need to correct it, *but* we need to be careful about not disturbing the hash value
char* nameData = static_cast<char*>(allocator.allocate(length + 1));
memcpy(nameData, name, length);
nameData[length] = 0;
const_cast<Entry&>(entry).value = AstName(nameData);
const_cast<Entry&>(entry).type = Lexeme::Name;
return std::make_pair(entry.value, entry.type);
}
std::pair<AstName, Lexeme::Type> AstNameTable::getWithType(const char* name, size_t length) const
{
if (const Entry* entry = data.find({AstName(name), uint32_t(length), Lexeme::Eof}))
{
return std::make_pair(entry->value, entry->type);
}
return std::make_pair(AstName(), Lexeme::Name);
}
AstName AstNameTable::getOrAdd(const char* name)
{
return getOrAddWithType(name, strlen(name)).first;
}
AstName AstNameTable::get(const char* name) const
{
return getWithType(name, strlen(name)).first;
}
inline bool isAlpha(char ch)
{
// use or trick to convert to lower case and unsigned comparison to do range check
return unsigned((ch | ' ') - 'a') < 26;
}
inline bool isDigit(char ch)
{
return unsigned(ch - '0') < 10;
}
inline bool isHexDigit(char ch)
{
// use or trick to convert to lower case and unsigned comparison to do range check
return unsigned(ch - '0') < 10 || unsigned((ch | ' ') - 'a') < 6;
}
inline bool isNewline(char ch)
{
return ch == '\n';
}
static char unescape(char ch)
{
switch (ch)
{
case 'a':
return '\a';
case 'b':
return '\b';
case 'f':
return '\f';
case 'n':
return '\n';
case 'r':
return '\r';
case 't':
return '\t';
case 'v':
return '\v';
default:
return ch;
}
}
Lexer::Lexer(const char* buffer, size_t bufferSize, AstNameTable& names)
: buffer(buffer)
, bufferSize(bufferSize)
, offset(0)
, line(0)
, lineOffset(0)
, lexeme(Location(Position(0, 0), 0), Lexeme::Eof)
, names(names)
, skipComments(false)
, readNames(true)
{
}
void Lexer::setSkipComments(bool skip)
{
skipComments = skip;
}
void Lexer::setReadNames(bool read)
{
readNames = read;
}
const Lexeme& Lexer::next()
{
return next(this->skipComments);
}
const Lexeme& Lexer::next(bool skipComments)
{
// in skipComments mode we reject valid comments
do
{
// consume whitespace before the token
while (isSpace(peekch()))
consume();
prevLocation = lexeme.location;
lexeme = readNext();
} while (skipComments && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment));
return lexeme;
}
void Lexer::nextline()
{
while (peekch() != 0 && peekch() != '\r' && !isNewline(peekch()))
consume();
next();
}
Lexeme Lexer::lookahead()
{
unsigned int currentOffset = offset;
unsigned int currentLine = line;
unsigned int currentLineOffset = lineOffset;
Lexeme currentLexeme = lexeme;
Location currentPrevLocation = prevLocation;
Lexeme result = next();
offset = currentOffset;
line = currentLine;
lineOffset = currentLineOffset;
lexeme = currentLexeme;
prevLocation = currentPrevLocation;
return result;
}
bool Lexer::isReserved(const std::string& word)
{
for (int i = Lexeme::Reserved_BEGIN; i < Lexeme::Reserved_END; ++i)
if (word == kReserved[i - Lexeme::Reserved_BEGIN])
return true;
return false;
}
LUAU_FORCEINLINE
char Lexer::peekch() const
{
return (offset < bufferSize) ? buffer[offset] : 0;
}
LUAU_FORCEINLINE
char Lexer::peekch(unsigned int lookahead) const
{
return (offset + lookahead < bufferSize) ? buffer[offset + lookahead] : 0;
}
Position Lexer::position() const
{
return Position(line, offset - lineOffset);
}
void Lexer::consume()
{
if (isNewline(buffer[offset]))
{
line++;
lineOffset = offset + 1;
}
offset++;
}
Lexeme Lexer::readCommentBody()
{
Position start = position();
LUAU_ASSERT(peekch(0) == '-' && peekch(1) == '-');
consume();
consume();
size_t startOffset = offset;
if (peekch() == '[')
{
int sep = skipLongSeparator();
if (sep >= 0)
{
return readLongString(start, sep, Lexeme::BlockComment, Lexeme::BrokenComment);
}
}
// fall back to single-line comment
while (peekch() != 0 && peekch() != '\r' && !isNewline(peekch()))
consume();
return Lexeme(Location(start, position()), Lexeme::Comment, &buffer[startOffset], offset - startOffset);
}
// Given a sequence [===[ or ]===], returns:
// 1. number of equal signs (or 0 if none present) between the brackets
// 2. -1 if this is not a long comment/string separator
// 3. -N if this is a malformed separator
// Does *not* consume the closing brace.
int Lexer::skipLongSeparator()
{
char start = peekch();
LUAU_ASSERT(start == '[' || start == ']');
consume();
int count = 0;
while (peekch() == '=')
{
consume();
count++;
}
return (start == peekch()) ? count : (-count) - 1;
}
Lexeme Lexer::readLongString(const Position& start, int sep, Lexeme::Type ok, Lexeme::Type broken)
{
// skip (second) [
LUAU_ASSERT(peekch() == '[');
consume();
unsigned int startOffset = offset;
while (peekch())
{
if (peekch() == ']')
{
if (skipLongSeparator() == sep)
{
LUAU_ASSERT(peekch() == ']');
consume(); // skip (second) ]
unsigned int endOffset = offset - sep - 2;
LUAU_ASSERT(endOffset >= startOffset);
return Lexeme(Location(start, position()), ok, &buffer[startOffset], endOffset - startOffset);
}
}
else
{
consume();
}
}
return Lexeme(Location(start, position()), broken);
}
Lexeme Lexer::readQuotedString()
{
Position start = position();
char delimiter = peekch();
LUAU_ASSERT(delimiter == '\'' || delimiter == '"');
consume();
unsigned int startOffset = offset;
while (peekch() != delimiter)
{
switch (peekch())
{
case 0:
case '\r':
case '\n':
return Lexeme(Location(start, position()), Lexeme::BrokenString);
case '\\':
consume();
switch (peekch())
{
case '\r':
consume();
if (peekch() == '\n')
consume();
break;
case 0:
break;
case 'z':
consume();
while (isSpace(peekch()))
consume();
break;
default:
consume();
}
break;
default:
consume();
}
}
consume();
return Lexeme(Location(start, position()), Lexeme::QuotedString, &buffer[startOffset], offset - startOffset - 1);
}
Lexeme Lexer::readNumber(const Position& start, unsigned int startOffset)
{
LUAU_ASSERT(isDigit(peekch()));
// This function does not do the number parsing - it only skips a number-like pattern.
// It uses the same logic as Lua stock lexer; the resulting string is later converted
// to a number with proper verification.
do
{
consume();
} while (isDigit(peekch()) || peekch() == '.' || peekch() == '_');
if (peekch() == 'e' || peekch() == 'E')
{
consume();
if (peekch() == '+' || peekch() == '-')
consume();
}
while (isAlpha(peekch()) || isDigit(peekch()) || peekch() == '_')
consume();
return Lexeme(Location(start, position()), Lexeme::Number, &buffer[startOffset], offset - startOffset);
}
std::pair<AstName, Lexeme::Type> Lexer::readName()
{
LUAU_ASSERT(isAlpha(peekch()) || peekch() == '_');
unsigned int startOffset = offset;
do
consume();
while (isAlpha(peekch()) || isDigit(peekch()) || peekch() == '_');
return readNames ? names.getOrAddWithType(&buffer[startOffset], offset - startOffset)
: names.getWithType(&buffer[startOffset], offset - startOffset);
}
Lexeme Lexer::readNext()
{
Position start = position();
switch (peekch())
{
case 0:
return Lexeme(Location(start, 0), Lexeme::Eof);
case '-':
{
if (peekch(1) == '>')
{
consume();
consume();
return Lexeme(Location(start, 2), Lexeme::SkinnyArrow);
}
else if (peekch(1) == '=')
{
consume();
consume();
return Lexeme(Location(start, 2), Lexeme::SubAssign);
}
else if (peekch(1) == '-')
{
return readCommentBody();
}
else
{
consume();
return Lexeme(Location(start, 1), '-');
}
}
case '[':
{
int sep = skipLongSeparator();
if (sep >= 0)
{
return readLongString(start, sep, Lexeme::RawString, Lexeme::BrokenString);
}
else if (sep == -1)
{
return Lexeme(Location(start, 1), '[');
}
else
{
return Lexeme(Location(start, position()), Lexeme::BrokenString);
}
}
case '=':
{
consume();
if (peekch() == '=')
{
consume();
return Lexeme(Location(start, 2), Lexeme::Equal);
}
else
return Lexeme(Location(start, 1), '=');
}
case '<':
{
consume();
if (peekch() == '=')
{
consume();
return Lexeme(Location(start, 2), Lexeme::LessEqual);
}
else
return Lexeme(Location(start, 1), '<');
}
case '>':
{
consume();
if (peekch() == '=')
{
consume();
return Lexeme(Location(start, 2), Lexeme::GreaterEqual);
}
else
return Lexeme(Location(start, 1), '>');
}
case '~':
{
consume();
if (peekch() == '=')
{
consume();
return Lexeme(Location(start, 2), Lexeme::NotEqual);
}
else
return Lexeme(Location(start, 1), '~');
}
case '"':
case '\'':
return readQuotedString();
case '.':
consume();
if (peekch() == '.')
{
consume();
if (peekch() == '.')
{
consume();
return Lexeme(Location(start, 3), Lexeme::Dot3);
}
else if (peekch() == '=')
{
consume();
return Lexeme(Location(start, 3), Lexeme::ConcatAssign);
}
else
return Lexeme(Location(start, 2), Lexeme::Dot2);
}
else
{
if (isDigit(peekch()))
{
return readNumber(start, offset - 1);
}
else
return Lexeme(Location(start, 1), '.');
}
case '+':
consume();
if (peekch() == '=')
{
consume();
return Lexeme(Location(start, 2), Lexeme::AddAssign);
}
else
return Lexeme(Location(start, 1), '+');
case '/':
consume();
if (peekch() == '=')
{
consume();
return Lexeme(Location(start, 2), Lexeme::DivAssign);
}
else
return Lexeme(Location(start, 1), '/');
case '*':
consume();
if (peekch() == '=')
{
consume();
return Lexeme(Location(start, 2), Lexeme::MulAssign);
}
else
return Lexeme(Location(start, 1), '*');
case '%':
consume();
if (peekch() == '=')
{
consume();
return Lexeme(Location(start, 2), Lexeme::ModAssign);
}
else
return Lexeme(Location(start, 1), '%');
case '^':
consume();
if (peekch() == '=')
{
consume();
return Lexeme(Location(start, 2), Lexeme::PowAssign);
}
else
return Lexeme(Location(start, 1), '^');
case ':':
{
consume();
if (peekch() == ':')
{
consume();
return Lexeme(Location(start, 2), Lexeme::DoubleColon);
}
else
return Lexeme(Location(start, 1), ':');
}
case '(':
case ')':
case '{':
case '}':
case ']':
case ';':
case ',':
case '#':
{
char ch = peekch();
consume();
return Lexeme(Location(start, 1), ch);
}
default:
if (isDigit(peekch()))
{
return readNumber(start, offset);
}
else if (isAlpha(peekch()) || peekch() == '_')
{
std::pair<AstName, Lexeme::Type> name = readName();
return Lexeme(Location(start, position()), name.second, name.first.value);
}
else if (peekch() & 0x80)
{
return readUtf8Error();
}
else
{
char ch = peekch();
consume();
return Lexeme(Location(start, 1), ch);
}
}
}
LUAU_NOINLINE Lexeme Lexer::readUtf8Error()
{
Position start = position();
uint32_t codepoint = 0;
int size = 0;
if ((peekch() & 0b10000000) == 0b00000000)
{
size = 1;
codepoint = peekch() & 0x7F;
}
else if ((peekch() & 0b11100000) == 0b11000000)
{
size = 2;
codepoint = peekch() & 0b11111;
}
else if ((peekch() & 0b11110000) == 0b11100000)
{
size = 3;
codepoint = peekch() & 0b1111;
}
else if ((peekch() & 0b11111000) == 0b11110000)
{
size = 4;
codepoint = peekch() & 0b111;
}
else
{
consume();
return Lexeme(Location(start, position()), Lexeme::BrokenUnicode);
}
consume();
for (int i = 1; i < size; ++i)
{
if ((peekch() & 0b11000000) != 0b10000000)
return Lexeme(Location(start, position()), Lexeme::BrokenUnicode);
codepoint = codepoint << 6;
codepoint |= (peekch() & 0b00111111);
consume();
}
Lexeme result(Location(start, position()), Lexeme::BrokenUnicode);
result.codepoint = codepoint;
return result;
}
static size_t toUtf8(char* data, unsigned int code)
{
// U+0000..U+007F
if (code < 0x80)
{
data[0] = char(code);
return 1;
}
// U+0080..U+07FF
else if (code < 0x800)
{
data[0] = char(0xC0 | (code >> 6));
data[1] = char(0x80 | (code & 0x3F));
return 2;
}
// U+0800..U+FFFF
else if (code < 0x10000)
{
data[0] = char(0xE0 | (code >> 12));
data[1] = char(0x80 | ((code >> 6) & 0x3F));
data[2] = char(0x80 | (code & 0x3F));
return 3;
}
// U+10000..U+10FFFF
else if (code < 0x110000)
{
data[0] = char(0xF0 | (code >> 18));
data[1] = char(0x80 | ((code >> 12) & 0x3F));
data[2] = char(0x80 | ((code >> 6) & 0x3F));
data[3] = char(0x80 | (code & 0x3F));
return 4;
}
else
{
return 0;
}
}
bool Lexer::fixupQuotedString(std::string& data)
{
if (data.empty() || data.find('\\') == std::string::npos)
return true;
size_t size = data.size();
size_t write = 0;
for (size_t i = 0; i < size;)
{
if (data[i] != '\\')
{
data[write++] = data[i];
i++;
continue;
}
if (i + 1 == size)
return false;
char escape = data[i + 1];
i += 2; // skip \e
switch (escape)
{
case '\n':
data[write++] = '\n';
break;
case '\r':
data[write++] = '\n';
if (i < size && data[i] == '\n')
i++;
break;
case 0:
return false;
case 'x':
{
// hex escape codes are exactly 2 hex digits long
if (i + 2 > size)
return false;
unsigned int code = 0;
for (int j = 0; j < 2; ++j)
{
char ch = data[i + j];
if (!isHexDigit(ch))
return false;
// use or trick to convert to lower case
code = 16 * code + (isDigit(ch) ? ch - '0' : (ch | ' ') - 'a' + 10);
}
data[write++] = char(code);
i += 2;
break;
}
case 'z':
{
while (i < size && isSpace(data[i]))
i++;
break;
}
case 'u':
{
// unicode escape codes are at least 3 characters including braces
if (i + 3 > size)
return false;
if (data[i] != '{')
return false;
i++;
if (data[i] == '}')
return false;
unsigned int code = 0;
for (int j = 0; j < 16; ++j)
{
if (i == size)
return false;
char ch = data[i];
if (ch == '}')
break;
if (!isHexDigit(ch))
return false;
// use or trick to convert to lower case
code = 16 * code + (isDigit(ch) ? ch - '0' : (ch | ' ') - 'a' + 10);
i++;
}
if (i == size || data[i] != '}')
return false;
i++;
size_t utf8 = toUtf8(&data[write], code);
if (utf8 == 0)
return false;
write += utf8;
break;
}
default:
{
if (isDigit(escape))
{
unsigned int code = escape - '0';
for (int j = 0; j < 2; ++j)
{
if (i == size || !isDigit(data[i]))
break;
code = 10 * code + (data[i] - '0');
i++;
}
if (code > UCHAR_MAX)
return false;
data[write++] = char(code);
}
else
{
data[write++] = unescape(escape);
}
}
}
}
LUAU_ASSERT(write <= size);
data.resize(write);
return true;
}
void Lexer::fixupMultilineString(std::string& data)
{
if (data.empty())
return;
// Lua rules for multiline strings are as follows:
// - standalone \r, \r\n, \n\r and \n are all considered newlines
// - first newline in the multiline string is skipped
// - all other newlines are normalized to \n
// Since our lexer just treats \n as newlines, we apply a simplified set of rules that is sufficient to get normalized newlines for Windows/Unix:
// - \r\n and \n are considered newlines
// - first newline is skipped
// - newlines are normalized to \n
// This makes the string parsing behavior consistent with general lexing behavior - a standalone \r isn't considered a new line from the line
// tracking perspective
const char* src = data.c_str();
char* dst = &data[0];
// skip leading newline
if (src[0] == '\r' && src[1] == '\n')
{
src += 2;
}
else if (src[0] == '\n')
{
src += 1;
}
// parse the rest of the string, converting newlines as we go
while (*src)
{
if (src[0] == '\r' && src[1] == '\n')
{
*dst++ = '\n';
src += 2;
}
else // note, this handles \n by just writing it without changes
{
*dst++ = *src;
src += 1;
}
}
data.resize(dst - &data[0]);
}
} // namespace Luau