diff --git a/Ast/include/Luau/Lexer.h b/Ast/include/Luau/Lexer.h index 3d93cf75..3cc7f453 100644 --- a/Ast/include/Luau/Lexer.h +++ b/Ast/include/Luau/Lexer.h @@ -53,6 +53,7 @@ struct Lexeme Comment, BlockComment, + Whitespace, Attribute, @@ -100,7 +101,7 @@ private: public: union { - const char* data; // String, Number, Comment + const char* data; // String, Number, Comment, Whitespace const char* name; // Name unsigned int codepoint; // BrokenUnicode }; @@ -155,7 +156,7 @@ class Lexer public: Lexer(const char* buffer, std::size_t bufferSize, AstNameTable& names, Position startPosition = {0, 0}); - void setSkipComments(bool skip); + void setSkipTrivia(bool skip); void setReadNames(bool read); const Location& previousLocation() const @@ -164,7 +165,7 @@ public: } const Lexeme& next(); - const Lexeme& next(bool skipComments, bool updatePrevLocation); + const Lexeme& next(bool skipTrivia, bool updatePrevLocation); void nextline(); Lexeme lookahead(); @@ -227,7 +228,7 @@ private: AstNameTable& names; - bool skipComments; + bool skipTrivia; bool readNames; enum class BraceType diff --git a/Ast/src/Lexer.cpp b/Ast/src/Lexer.cpp index 86b44044..e9a82f8f 100644 --- a/Ast/src/Lexer.cpp +++ b/Ast/src/Lexer.cpp @@ -9,6 +9,7 @@ #include LUAU_FASTFLAGVARIABLE(LexerResumesFromPosition2) +LUAU_FASTFLAGVARIABLE(LuauLexerTokenizesWhitespace) namespace Luau { @@ -36,7 +37,7 @@ Lexeme::Lexeme(const Location& location, Type type, const char* data, size_t siz { LUAU_ASSERT( type == RawString || type == QuotedString || type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd || - type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment + type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment || type == Whitespace ); } @@ -53,7 +54,7 @@ unsigned int Lexeme::getLength() const { LUAU_ASSERT( type == RawString || type == QuotedString 
|| type == InterpStringBegin || type == InterpStringMid || type == InterpStringEnd || - type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment + type == InterpStringSimple || type == BrokenInterpDoubleBrace || type == Number || type == Comment || type == BlockComment || type == Whitespace ); return length; @@ -315,14 +316,14 @@ Lexer::Lexer(const char* buffer, size_t bufferSize, AstNameTable& names, Positio Lexeme::Eof ) , names(names) - , skipComments(false) + , skipTrivia(false) , readNames(true) { } -void Lexer::setSkipComments(bool skip) +void Lexer::setSkipTrivia(bool skip) { - skipComments = skip; + skipTrivia = skip; } void Lexer::setReadNames(bool read) @@ -332,24 +333,27 @@ void Lexer::setReadNames(bool read) const Lexeme& Lexer::next() { - return next(this->skipComments, true); + return next(this->skipTrivia, true); } -const Lexeme& Lexer::next(bool skipComments, bool updatePrevLocation) +const Lexeme& Lexer::next(bool skipTrivia, bool updatePrevLocation) { - // in skipComments mode we reject valid comments + // in skipTrivia mode we reject valid comments and whitespace do { - // consume whitespace before the token - while (isSpace(peekch())) - consumeAny(); + if (!FFlag::LuauLexerTokenizesWhitespace) + { + // consume whitespace before the token + while (isSpace(peekch())) + consumeAny(); + } if (updatePrevLocation) prevLocation = lexeme.location; lexeme = readNext(); updatePrevLocation = false; - } while (skipComments && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment)); + } while (skipTrivia && (lexeme.type == Lexeme::Comment || lexeme.type == Lexeme::BlockComment || lexeme.type == Lexeme::Whitespace)); return lexeme; } @@ -967,6 +971,15 @@ Lexeme Lexer::readNext() return Lexeme(Location(start, position()), name.second, name.first.value); } + else if (FFlag::LuauLexerTokenizesWhitespace && isSpace(peekch())) + { + size_t startOffset = offset; + + while (isSpace(peekch())) + 
consumeAny(); + + return Lexeme(Location(start, position()), Lexeme::Whitespace, &buffer[startOffset], offset - startOffset); + } else if (peekch() & 0x80) { return readUtf8Error(); diff --git a/Ast/src/Parser.cpp b/Ast/src/Parser.cpp index e821902e..c61e7fb4 100644 --- a/Ast/src/Parser.cpp +++ b/Ast/src/Parser.cpp @@ -208,7 +208,7 @@ Parser::Parser(const char* buffer, size_t bufferSize, AstNameTable& names, Alloc matchRecoveryStopOnToken[Lexeme::Type::Eof] = 1; // required for lookahead() to work across a comment boundary and for nextLexeme() to work when captureComments is false - lexer.setSkipComments(true); + lexer.setSkipTrivia(true); // read first lexeme (any hot comments get .header = true) LUAU_ASSERT(hotcommentHeader); @@ -3572,13 +3572,13 @@ AstTypeError* Parser::reportMissingTypeError(const Location& parseErrorLocation, void Parser::nextLexeme() { - Lexeme::Type type = lexer.next(/* skipComments= */ false, true).type; + Lexeme::Type type = lexer.next(/* skipTrivia= */ false, true).type; - while (type == Lexeme::BrokenComment || type == Lexeme::Comment || type == Lexeme::BlockComment) + while (type == Lexeme::BrokenComment || type == Lexeme::Comment || type == Lexeme::BlockComment || type == Lexeme::Whitespace) { const Lexeme& lexeme = lexer.current(); - if (options.captureComments) + if (options.captureComments && type != Lexeme::Whitespace) commentLocations.push_back(Comment{lexeme.type, lexeme.location}); // Subtlety: Broken comments are weird because we record them as comments AND pass them to the parser as a lexeme. 
@@ -3598,7 +3598,7 @@ void Parser::nextLexeme() hotcomments.push_back({hotcommentHeader, lexeme.location, std::string(text + 1, text + end)}); } - type = lexer.next(/* skipComments= */ false, /* updatePrevLocation= */ false).type; + type = lexer.next(/* skipTrivia= */ false, /* updatePrevLocation= */ false).type; } } diff --git a/tests/Lexer.test.cpp b/tests/Lexer.test.cpp index e0716e4c..a477f37b 100644 --- a/tests/Lexer.test.cpp +++ b/tests/Lexer.test.cpp @@ -8,6 +8,8 @@ using namespace Luau; +LUAU_FASTFLAG(LuauLexerTokenizesWhitespace) + TEST_SUITE_BEGIN("LexerTests"); TEST_CASE("broken_string_works") @@ -38,7 +40,7 @@ TEST_CASE("broken_comment_kept") Luau::Allocator alloc; AstNameTable table(alloc); Lexer lexer(testInput.c_str(), testInput.size(), table); - lexer.setSkipComments(true); + lexer.setSkipTrivia(true); CHECK_EQ(lexer.next().type, Lexeme::Type::BrokenComment); } @@ -48,7 +50,7 @@ TEST_CASE("comment_skipped") Luau::Allocator alloc; AstNameTable table(alloc); Lexer lexer(testInput.c_str(), testInput.size(), table); - lexer.setSkipComments(true); + lexer.setSkipTrivia(true); CHECK_EQ(lexer.next().type, Lexeme::Type::Eof); } @@ -103,7 +105,7 @@ TEST_CASE("lookahead") Luau::Allocator alloc; AstNameTable table(alloc); Lexer lexer(testInput.c_str(), testInput.size(), table); - lexer.setSkipComments(true); + lexer.setSkipTrivia(true); lexer.next(); // must call next() before reading data from lexer at least once CHECK_EQ(lexer.current().type, Lexeme::Name); @@ -242,4 +244,48 @@ TEST_CASE("string_interpolation_with_unicode_escape") CHECK_EQ(lexer.next().type, Lexeme::Eof); } +TEST_CASE("lexer_tokenizes_whitespace") +{ + ScopedFastFlag sff{FFlag::LuauLexerTokenizesWhitespace, true}; + + const std::string testInput = "local x = 1"; + Luau::Allocator alloc; + AstNameTable table(alloc); + Lexer lexer(testInput.c_str(), testInput.size(), table); + + CHECK_EQ(lexer.next().type, Lexeme::ReservedLocal); + CHECK_EQ(lexer.next().type, Lexeme::Whitespace); + 
CHECK_EQ(lexer.next().type, Lexeme::Name); + CHECK_EQ(lexer.next().type, Lexeme::Whitespace); + CHECK_EQ(lexer.next().type, '='); + CHECK_EQ(lexer.next().type, Lexeme::Whitespace); + CHECK_EQ(lexer.next().type, Lexeme::Number); + CHECK_EQ(lexer.next().type, Lexeme::Eof); +} + +TEST_CASE("lexer_tokenizes_multiline_whitespace") +{ + ScopedFastFlag sff{FFlag::LuauLexerTokenizesWhitespace, true}; + + const std::string testInput = R"(local x + + y = 2 + )"; + Luau::Allocator alloc; + AstNameTable table(alloc); + Lexer lexer(testInput.c_str(), testInput.size(), table); + + CHECK_EQ(lexer.next().type, Lexeme::ReservedLocal); + CHECK_EQ(lexer.next().type, Lexeme::Whitespace); + CHECK_EQ(lexer.next().type, Lexeme::Name); + CHECK_EQ(lexer.next().type, Lexeme::Whitespace); + CHECK_EQ(lexer.next().type, Lexeme::Name); + CHECK_EQ(lexer.next().type, Lexeme::Whitespace); + CHECK_EQ(lexer.next().type, '='); + CHECK_EQ(lexer.next().type, Lexeme::Whitespace); + CHECK_EQ(lexer.next().type, Lexeme::Number); + CHECK_EQ(lexer.next().type, Lexeme::Whitespace); + CHECK_EQ(lexer.next().type, Lexeme::Eof); +} + TEST_SUITE_END();