Tokenize whitespace in Lexer

This commit is contained in:
JohnnyMorganz 2024-12-16 20:00:37 +00:00
parent 2e6fdd90a0
commit 00a5e2f2e9
4 changed files with 84 additions and 24 deletions

View file

@ -53,6 +53,7 @@ struct Lexeme
Comment, Comment,
BlockComment, BlockComment,
Whitespace,
Attribute, Attribute,
@ -100,7 +101,7 @@ private:
public: public:
union union
{ {
const char* data; // String, Number, Comment const char* data; // String, Number, Comment, Whitespace
const char* name; // Name const char* name; // Name
unsigned int codepoint; // BrokenUnicode unsigned int codepoint; // BrokenUnicode
}; };
@ -155,7 +156,7 @@ class Lexer
public: public:
Lexer(const char* buffer, std::size_t bufferSize, AstNameTable& names, Position startPosition = {0, 0}); Lexer(const char* buffer, std::size_t bufferSize, AstNameTable& names, Position startPosition = {0, 0});
void setSkipComments(bool skip); void setSkipTrivia(bool skip);
void setReadNames(bool read); void setReadNames(bool read);
const Location& previousLocation() const const Location& previousLocation() const
@ -164,7 +165,7 @@ public:
} }
const Lexeme& next(); const Lexeme& next();
const Lexeme& next(bool skipComments, bool updatePrevLocation); const Lexeme& next(bool skipTrivia, bool updatePrevLocation);
void nextline(); void nextline();
Lexeme lookahead(); Lexeme lookahead();
@ -227,7 +228,7 @@ private:
AstNameTable& names; AstNameTable& names;
bool skipComments; bool skipTrivia;
bool readNames; bool readNames;
enum class BraceType enum class BraceType

View file

@ -9,6 +9,7 @@
#include <limits.h> #include <limits.h>
LUAU_FASTFLAGVARIABLE(LexerResumesFromPosition2) LUAU_FASTFLAGVARIABLE(LexerResumesFromPosition2)
LUAU_FASTFLAGVARIABLE(LuauLexerTokenizesWhitespace)
namespace Luau namespace Luau
{ {
@ -36,7 +37,7 @@ Lexeme::Lexeme(const Location& location, Type type, const char* data, size_t siz
{ {
LUAU_ASSERT( LUAU_ASSERT(
type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd || type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd ||
type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment || type == Whitespace
); );
} }
@ -53,7 +54,7 @@ unsigned int Lexeme::getLength() const
{ {
LUAU_ASSERT( LUAU_ASSERT(
type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd || type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd ||
type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment || type == Whitespace
); );
return length; return length;
@ -315,14 +316,14 @@ Lexer::Lexer(const char* buffer, size_t bufferSize, AstNameTable& names, Positio
Lexeme::Eof Lexeme::Eof
) )
, names(names) , names(names)
, skipComments(false) , skipTrivia(false)
, readNames(true) , readNames(true)
{ {
} }
void Lexer::setSkipComments(bool skip) void Lexer::setSkipTrivia(bool skip)
{ {
skipComments = skip; skipTrivia = skip;
} }
void Lexer::setReadNames(bool read) void Lexer::setReadNames(bool read)
@ -332,24 +333,27 @@ void Lexer::setReadNames(bool read)
const Lexeme& Lexer::next() const Lexeme& Lexer::next()
{ {
return next(this->skipComments, true); return next(this->skipTrivia, true);
} }
const Lexeme& Lexer::next(bool skipComments, bool updatePrevLocation) const Lexeme& Lexer::next(bool skipTrivia, bool updatePrevLocation)
{ {
// in skipComments mode we reject valid comments // in skipTrivia mode we reject valid comments and whitespace
do do
{
if (!FFlag::LuauLexerTokenizesWhitespace)
{ {
// consume whitespace before the token // consume whitespace before the token
while (isSpace(peekch())) while (isSpace(peekch()))
consumeAny(); consumeAny();
}
if (updatePrevLocation) if (updatePrevLocation)
prevLocation = lexeme.location; prevLocation = lexeme.location;
lexeme = readNext(); lexeme = readNext();
updatePrevLocation = false; updatePrevLocation = false;
} while (skipComments && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment)); } while (skipTrivia && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment || lexeme.type == Lexeme::Whitespace));
return lexeme; return lexeme;
} }
@ -967,6 +971,15 @@ Lexeme Lexer::readNext()
return Lexeme(Location(start, position()), name.second, name.first.value); return Lexeme(Location(start, position()), name.second, name.first.value);
} }
else if (FFlag::LuauLexerTokenizesWhitespace && isSpace(peekch()))
{
size_t startOffset = offset;
while (isSpace(peekch()))
consumeAny();
return Lexeme(Location(start, position()), Lexeme::Whitespace, &buffer[startOffset], offset - startOffset);
}
else if (peekch() & 0x80) else if (peekch() & 0x80)
{ {
return readUtf8Error(); return readUtf8Error();

View file

@ -208,7 +208,7 @@ Parser::Parser(const char* buffer, size_t bufferSize, AstNameTable& names, Alloc
matchRecoveryStopOnToken[Lexeme::Type::Eof] = 1; matchRecoveryStopOnToken[Lexeme::Type::Eof] = 1;
// required for lookahead() to work across a comment boundary and for nextLexeme() to work when captureComments is false // required for lookahead() to work across a comment boundary and for nextLexeme() to work when captureComments is false
lexer.setSkipComments(true); lexer.setSkipTrivia(true);
// read first lexeme (any hot comments get .header = true) // read first lexeme (any hot comments get .header = true)
LUAU_ASSERT(hotcommentHeader); LUAU_ASSERT(hotcommentHeader);
@ -3572,13 +3572,13 @@ AstTypeError* Parser::reportMissingTypeError(const Location& parseErrorLocation,
void Parser::nextLexeme() void Parser::nextLexeme()
{ {
Lexeme::Type type = lexer.next(/* skipComments= */ false, true).type; Lexeme::Type type = lexer.next(/* skipTrivia= */ false, true).type;
while (type == Lexeme::BrokenComment || type == Lexeme::Comment || type == Lexeme::BlockComment) while (type == Lexeme::BrokenComment || type == Lexeme::Comment || type == Lexeme::BlockComment || type == Lexeme::Whitespace)
{ {
const Lexeme& lexeme = lexer.current(); const Lexeme& lexeme = lexer.current();
if (options.captureComments) if (options.captureComments && type != Lexeme::Whitespace)
commentLocations.push_back(Comment{lexeme.type, lexeme.location}); commentLocations.push_back(Comment{lexeme.type, lexeme.location});
// Subtlety: Broken comments are weird because we record them as comments AND pass them to the parser as a lexeme. // Subtlety: Broken comments are weird because we record them as comments AND pass them to the parser as a lexeme.
@ -3598,7 +3598,7 @@ void Parser::nextLexeme()
hotcomments.push_back({hotcommentHeader, lexeme.location, std::string(text + 1, text + end)}); hotcomments.push_back({hotcommentHeader, lexeme.location, std::string(text + 1, text + end)});
} }
type = lexer.next(/* skipComments= */ false, /* updatePrevLocation= */ false).type; type = lexer.next(/* skipTrivia= */ false, /* updatePrevLocation= */ false).type;
} }
} }

View file

@ -8,6 +8,8 @@
using namespace Luau; using namespace Luau;
LUAU_FASTFLAG(LuauLexerTokenizesWhitespace)
TEST_SUITE_BEGIN("LexerTests"); TEST_SUITE_BEGIN("LexerTests");
TEST_CASE("broken_string_works") TEST_CASE("broken_string_works")
@ -38,7 +40,7 @@ TEST_CASE("broken_comment_kept")
Luau::Allocator alloc; Luau::Allocator alloc;
AstNameTable table(alloc); AstNameTable table(alloc);
Lexer lexer(testInput.c_str(), testInput.size(), table); Lexer lexer(testInput.c_str(), testInput.size(), table);
lexer.setSkipComments(true); lexer.setSkipTrivia(true);
CHECK_EQ(lexer.next().type, Lexeme::Type::BrokenComment); CHECK_EQ(lexer.next().type, Lexeme::Type::BrokenComment);
} }
@ -48,7 +50,7 @@ TEST_CASE("comment_skipped")
Luau::Allocator alloc; Luau::Allocator alloc;
AstNameTable table(alloc); AstNameTable table(alloc);
Lexer lexer(testInput.c_str(), testInput.size(), table); Lexer lexer(testInput.c_str(), testInput.size(), table);
lexer.setSkipComments(true); lexer.setSkipTrivia(true);
CHECK_EQ(lexer.next().type, Lexeme::Type::Eof); CHECK_EQ(lexer.next().type, Lexeme::Type::Eof);
} }
@ -103,7 +105,7 @@ TEST_CASE("lookahead")
Luau::Allocator alloc; Luau::Allocator alloc;
AstNameTable table(alloc); AstNameTable table(alloc);
Lexer lexer(testInput.c_str(), testInput.size(), table); Lexer lexer(testInput.c_str(), testInput.size(), table);
lexer.setSkipComments(true); lexer.setSkipTrivia(true);
lexer.next(); // must call next() before reading data from lexer at least once lexer.next(); // must call next() before reading data from lexer at least once
CHECK_EQ(lexer.current().type, Lexeme::Name); CHECK_EQ(lexer.current().type, Lexeme::Name);
@ -242,4 +244,48 @@ TEST_CASE("string_interpolation_with_unicode_escape")
CHECK_EQ(lexer.next().type, Lexeme::Eof); CHECK_EQ(lexer.next().type, Lexeme::Eof);
} }
// Verifies that, with LuauLexerTokenizesWhitespace enabled and trivia skipping left at the
// lexer's default (off), each run of spaces between tokens is surfaced as its own Whitespace
// lexeme, and that the lexeme's data/length describe exactly the characters that were consumed.
TEST_CASE("lexer_tokenizes_whitespace")
{
    ScopedFastFlag sff{FFlag::LuauLexerTokenizesWhitespace, true};

    const std::string testInput = "local x = 1";
    Luau::Allocator alloc;
    AstNameTable table(alloc);
    Lexer lexer(testInput.c_str(), testInput.size(), table);

    // Advances the lexer once and checks that the result is a Whitespace lexeme whose text equals
    // `expected`. The payload is only read after the type is confirmed, because getLength()
    // asserts on lexeme types that do not carry data.
    auto checkWhitespace = [&lexer](const std::string& expected)
    {
        const Lexeme& lexeme = lexer.next();
        CHECK_EQ(lexeme.type, Lexeme::Whitespace);
        if (lexeme.type == Lexeme::Whitespace)
        {
            CHECK_EQ(std::string(lexeme.data, lexeme.getLength()), expected);
        }
    };

    CHECK_EQ(lexer.next().type, Lexeme::ReservedLocal);
    checkWhitespace(" ");
    CHECK_EQ(lexer.next().type, Lexeme::Name);
    checkWhitespace(" ");
    CHECK_EQ(lexer.next().type, '=');
    checkWhitespace(" ");
    CHECK_EQ(lexer.next().type, Lexeme::Number);
    // no trailing whitespace in the input, so the number is followed directly by Eof
    CHECK_EQ(lexer.next().type, Lexeme::Eof);
}
// With LuauLexerTokenizesWhitespace enabled, the separators in a multi-line input -- including
// the line break between statements and the one just before end of input -- are expected to come
// back from the lexer as Whitespace lexemes.
TEST_CASE("lexer_tokenizes_multiline_whitespace")
{
    ScopedFastFlag sff{FFlag::LuauLexerTokenizesWhitespace, true};

    const std::string source = R"(local x
y = 2
)";
    Luau::Allocator allocator;
    AstNameTable names(allocator);
    Lexer lexer(source.c_str(), source.size(), names);

    // Pulls the next lexeme from the lexer and yields only its type
    const auto nextType = [&lexer]() -> Lexeme::Type
    {
        return lexer.next().type;
    };

    // first line: `local x`, then the separator before the second line
    CHECK_EQ(nextType(), Lexeme::ReservedLocal);
    CHECK_EQ(nextType(), Lexeme::Whitespace);
    CHECK_EQ(nextType(), Lexeme::Name);
    CHECK_EQ(nextType(), Lexeme::Whitespace);

    // second line: `y = 2`, then the trailing separator before end of input
    CHECK_EQ(nextType(), Lexeme::Name);
    CHECK_EQ(nextType(), Lexeme::Whitespace);
    CHECK_EQ(nextType(), '=');
    CHECK_EQ(nextType(), Lexeme::Whitespace);
    CHECK_EQ(nextType(), Lexeme::Number);
    CHECK_EQ(nextType(), Lexeme::Whitespace);

    CHECK_EQ(nextType(), Lexeme::Eof);
}
TEST_SUITE_END(); TEST_SUITE_END();