From a483888b51e5431f9c9a9dffd558e5fbcc7d4f79 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Mon, 18 Mar 2024 15:53:20 +0100 Subject: [PATCH] lexer: allow TABS for indentation Allow the TAB character to be used for indentation. This is especially useful when parts of build.ninja are hand-written as HEREDOCs in an otherwise TAB-indented file (either mandated by the style of another part of the project, or required by the language itself). Changing the lexer is easy thanks to the use of re2c; the syntax is perhaps a bit too permissive now, but it is the job of the parser to reject the use of mixed indentation. Let's stop complaining that: ninja: error: build.ninja:3: expected 'command =' line when it is exactly: command = cc $cflags -c $in -o $out Closes #1598 Signed-off-by: Przemek Kitszel --- src/lexer.cc | 70 ++++++++++++++++++++++++----------------------- src/lexer.in.cc | 21 ++++++-------- src/lexer_test.cc | 27 +++++++++++++----- 3 files changed, 65 insertions(+), 53 deletions(-) diff --git a/src/lexer.cc b/src/lexer.cc index e5729f00a0..efca1be1c3 100644 --- a/src/lexer.cc +++ b/src/lexer.cc @@ -105,12 +105,9 @@ const char* Lexer::TokenErrorHint(Token expected) { string Lexer::DescribeLastError() { if (last_token_) { - switch (last_token_[0]) { - case '\t': - return "tabs are not allowed, use spaces"; - } + return "lexing error <"+string(last_token_)+">"; } - return "lexing error"; + return "lexing error (EOF?)"; } void Lexer::UnreadToken() { @@ -130,7 +127,7 @@ Lexer::Token Lexer::ReadToken() { unsigned int yyaccept = 0; static const unsigned char yybm[] = { 0, 128, 128, 128, 128, 128, 128, 128, - 128, 128, 0, 128, 128, 128, 128, 128, + 128, 160, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 160, 128, 128, 128, 128, 128, 128, 128, @@ -164,16 +161,17 @@ Lexer::Token Lexer::ReadToken() { }; yych = *p; if (yybm[0+yych] & 32) { - goto yy9; + goto yy6; } if (yych <= '^') { if (yych <= ',') 
{ if (yych <= '\f') { if (yych <= 0x00) goto yy2; - if (yych == '\n') goto yy6; + if (yych <= 0x08) goto yy4; + if (yych <= '\n') goto yy9; goto yy4; } else { - if (yych <= '\r') goto yy8; + if (yych <= '\r') goto yy11; if (yych == '#') goto yy12; goto yy4; } @@ -228,31 +226,32 @@ Lexer::Token Lexer::ReadToken() { yy5: { token = ERROR; break; } yy6: - ++p; - { token = NEWLINE; break; } -yy8: - yych = *++p; - if (yych == '\n') goto yy28; - goto yy5; -yy9: yyaccept = 0; yych = *(q = ++p); if (yybm[0+yych] & 32) { - goto yy9; + goto yy6; } if (yych <= '\f') { - if (yych == '\n') goto yy6; + if (yych <= 0x08) goto yy8; + if (yych <= '\n') goto yy9; } else { - if (yych <= '\r') goto yy30; - if (yych == '#') goto yy32; + if (yych <= '\r') goto yy28; + if (yych == '#') goto yy30; } -yy11: +yy8: { token = INDENT; break; } +yy9: + ++p; + { token = NEWLINE; break; } +yy11: + yych = *++p; + if (yych == '\n') goto yy32; + goto yy5; yy12: yyaccept = 1; yych = *(q = ++p); if (yych <= 0x00) goto yy5; - goto yy33; + goto yy31; yy13: yych = *++p; yy14: @@ -296,25 +295,27 @@ Lexer::Token Lexer::ReadToken() { if (yych == '|') goto yy44; { token = PIPE; break; } yy28: - ++p; - { token = NEWLINE; break; } -yy30: yych = *++p; - if (yych == '\n') goto yy28; -yy31: + if (yych == '\n') goto yy32; +yy29: p = q; if (yyaccept == 0) { - goto yy11; + goto yy8; } else { goto yy5; } -yy32: +yy30: yych = *++p; -yy33: +yy31: if (yybm[0+yych] & 128) { - goto yy32; + goto yy30; } - if (yych <= 0x00) goto yy31; + if (yych <= 0x00) goto yy29; + goto yy34; +yy32: + ++p; + { token = NEWLINE; break; } +yy34: ++p; { continue; } yy36: @@ -478,7 +479,7 @@ void Lexer::EatWhitespace() { unsigned char yych; static const unsigned char yybm[] = { 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, + 0, 128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0, 0, @@ -631,7 +632,7 @@ bool Lexer::ReadEvalString(EvalString* eval, bool path, string* err) { unsigned char yych; static 
const unsigned char yybm[] = { 0, 16, 16, 16, 16, 16, 16, 16, - 16, 16, 0, 16, 16, 0, 16, 16, + 16, 48, 0, 16, 16, 0, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 32, 16, 16, 16, 0, 16, 16, 16, @@ -797,6 +798,7 @@ bool Lexer::ReadEvalString(EvalString* eval, bool path, string* err) { goto yy113; yy128: yych = *++p; + if (yych == '\t') goto yy128; if (yych == ' ') goto yy128; { continue; diff --git a/src/lexer.in.cc b/src/lexer.in.cc index 6f1d8e7937..4dc3cdda18 100644 --- a/src/lexer.in.cc +++ b/src/lexer.in.cc @@ -104,12 +104,9 @@ const char* Lexer::TokenErrorHint(Token expected) { string Lexer::DescribeLastError() { if (last_token_) { - switch (last_token_[0]) { - case '\t': - return "tabs are not allowed, use spaces"; - } + return "lexing error <"+string(last_token_)+">"; } - return "lexing error"; + return "lexing error (EOF?)"; } void Lexer::UnreadToken() { @@ -133,10 +130,10 @@ Lexer::Token Lexer::ReadToken() { simple_varname = [a-zA-Z0-9_-]+; varname = [a-zA-Z0-9_.-]+; - [ ]*"#"[^\000\n]*"\n" { continue; } - [ ]*"\r\n" { token = NEWLINE; break; } - [ ]*"\n" { token = NEWLINE; break; } - [ ]+ { token = INDENT; break; } + [ \t]*"#"[^\000\n]*"\n" { continue; } + [ \t]*"\r\n" { token = NEWLINE; break; } + [ \t]*"\n" { token = NEWLINE; break; } + [ \t]+ { token = INDENT; break; } "build" { token = BUILD; break; } "pool" { token = POOL; break; } "rule" { token = RULE; break; } @@ -175,7 +172,7 @@ void Lexer::EatWhitespace() { for (;;) { ofs_ = p; /*!re2c - [ ]+ { continue; } + [ \t]+ { continue; } "$\r\n" { continue; } "$\n" { continue; } nul { break; } @@ -241,10 +238,10 @@ bool Lexer::ReadEvalString(EvalString* eval, bool path, string* err) { eval->AddText(StringPiece(" ", 1)); continue; } - "$\r\n"[ ]* { + "$\r\n"[ \t]* { continue; } - "$\n"[ ]* { + "$\n"[ \t]* { continue; } "${"varname"}" { diff --git a/src/lexer_test.cc b/src/lexer_test.cc index 26dc67f1e0..fd66be96bf 100644 --- a/src/lexer_test.cc +++ b/src/lexer_test.cc @@ -97,11 
+97,24 @@ TEST(Lexer, CommentEOF) { } TEST(Lexer, Tabs) { - // Verify we print a useful error on a disallowed character. - Lexer lexer(" \tfoobar"); - Lexer::Token token = lexer.ReadToken(); - EXPECT_EQ(Lexer::INDENT, token); - token = lexer.ReadToken(); - EXPECT_EQ(Lexer::ERROR, token); - EXPECT_EQ("tabs are not allowed, use spaces", lexer.DescribeLastError()); + Lexer lexer("rule foo\n" + "\tcommand = foobin $in"); + + EXPECT_EQ_TOK(Lexer::RULE, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::IDENT, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::NEWLINE, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::INDENT, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::IDENT, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::EQUALS, lexer.ReadToken()); +} + +TEST(Lexer, TabsInVars) { + Lexer lexer("cflags =\n" + "\t-std=c11"); + + EXPECT_EQ_TOK(Lexer::IDENT, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::EQUALS, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::NEWLINE, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::INDENT, lexer.ReadToken()); + EXPECT_EQ_TOK(Lexer::IDENT, lexer.ReadToken()); }