parser (feature): Parse indents #422

Merged · 6 commits · May 11, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion SCALA_VERSION
@@ -1 +1 @@
3.3.2
3.3.3
1 change: 1 addition & 0 deletions build.sbt
@@ -11,6 +11,7 @@ ThisBuild / resolvers ++= Resolver.sonatypeOssRepos("snapshots")
Global / onChangedBuildSource := ReloadOnSourceChanges

val buildSettings = Seq[Setting[?]](
scalaVersion := SCALA_3,
organization := "org.wvlet.lang",
description := "A framework for building functional data flows",
crossPaths := true,
180 changes: 114 additions & 66 deletions wvlet-lang/src/main/scala/wvlet/lang/compiler/parser/Token.scala
@@ -13,72 +13,120 @@
*/
package wvlet.lang.compiler.parser

enum Token(val str: String):
case EMPTY extends Token("<empty>")
case ERROR extends Token("<erroneous token>")
case EOF extends Token("<eof>")
case NEWLINE extends Token("<newline>")
case INDENT extends Token(" ")

case COLON extends Token(":")
case COMMA extends Token(",")

case DOT extends Token(".")
case DOUBLE_QUOTE extends Token("\"")
case SINGLE_QUOTE extends Token("'")

case L_PAREN extends Token("(")
case R_PAREN extends Token(")")
case L_BRACE extends Token("{")
case R_BRACE extends Token("}")
case L_BRACKET extends Token("[")
case R_BRACKET extends Token("]")

case EQ extends Token("=")
case NEQ extends Token("!=")
case LT extends Token("<")
case GT extends Token(">")
case LTEQ extends Token("<=")
case GTEQ extends Token(">=")

case IN extends Token("in")

case DEF extends Token("def")
case SCHEMA extends Token("schema")
case WITH extends Token("with")

case INTEGER_LITERAL extends Token("<integer literal>")
case DECIMAL_LITERAL extends Token("<decimal literal>")
case EXP_LITERAL extends Token("<exp literal>")
case LONG_LITERAL extends Token("<long literal>")
case FLOAT_LITERAL extends Token("<float literal>")
case DOUBLE_LITERAL extends Token("<double literal>")
case STRING_LITERAL extends Token("<string literal>")

case NULL extends Token("null")
case TRUE extends Token("true")
case FALSE extends Token("false")

case IDENTIFIER extends Token("<identifier>")
case QUOTED_IDENTIFIER extends Token("<quoted identifier>")

case FOR extends Token("for")
case LET extends Token("let")
case WHERE extends Token("where")
case GROUP_BY extends Token("group by")
case HAVING extends Token("having")
case RETURN extends Token("return")
case ORDER_BY extends Token("order by")

case RUN extends Token("run")
case EXPORT extends Token("export")

case IF extends Token("if")
enum TokenType:
case Control, Literal, Identifier, Op, Keyword

import TokenType.*

enum Token(val tokenType: TokenType, val str: String):
// special tokens
case EMPTY extends Token(Control, "<empty>")
case ERROR extends Token(Control, "<erroneous token>")
case EOF extends Token(Control, "<eof>")
case NEWLINE extends Token(Control, "<newline>")

// Literals
case INTEGER_LITERAL extends Token(Literal, "<integer literal>")
case DECIMAL_LITERAL extends Token(Literal, "<decimal literal>")
case EXP_LITERAL extends Token(Literal, "<exp literal>")
case LONG_LITERAL extends Token(Literal, "<long literal>")
case FLOAT_LITERAL extends Token(Literal, "<float literal>")
case DOUBLE_LITERAL extends Token(Literal, "<double literal>")
case STRING_LITERAL extends Token(Literal, "<string literal>")
// For interpolated string parts
case STRING_PART extends Token(Literal, "<string part>")

// Identifiers
case IDENTIFIER extends Token(Identifier, "<identifier>")
// Identifier wrapped in backquotes `....`
case BACKQUOTED_IDENTIFIER extends Token(Identifier, "<quoted identifier>")

// Parentheses
case L_PAREN extends Token(Op, "(")
case R_PAREN extends Token(Op, ")")
case L_BRACE extends Token(Op, "{")
case R_BRACE extends Token(Op, "}")
case L_BRACKET extends Token(Op, "[")
case R_BRACKET extends Token(Op, "]")
case INDENT extends Token(Control, "<indent>")
case OUTDENT extends Token(Control, "<outdent>")

// Special symbols
case COLON extends Token(Op, ":")
case COMMA extends Token(Op, ",")
case DOT extends Token(Op, ".")
case UNDERSCORE extends Token(Op, "_")
case AT extends Token(Op, "@")
case DOLLAR extends Token(Op, "$")
case STAR extends Token(Op, "*")
case QUESTION extends Token(Op, "?")

case L_ARROW extends Token(Op, "<-")
case R_ARROW extends Token(Op, "->")
case R_DOUBLE_ARROW extends Token(Op, "=>")

case SINGLE_QUOTE extends Token(Op, "'")
case DOUBLE_QUOTE extends Token(Op, "\"")

// Special keywords
case EQ extends Token(Op, "=")
case NEQ extends Token(Op, "!=")
case LT extends Token(Op, "<")
case GT extends Token(Op, ">")
case LTEQ extends Token(Op, "<=")
case GTEQ extends Token(Op, ">=")

case PLUS extends Token(Op, "+")
case MINUS extends Token(Op, "-")
case ASTERISK extends Token(Op, "*")
case DIV extends Token(Op, "/")
case MOD extends Token(Op, "%")

case AMP extends Token(Op, "&")
case PIPE extends Token(Op, "|")

case HASH extends Token(Op, "#")

// literal keywords
case NULL extends Token(Keyword, "null")
case TRUE extends Token(Keyword, "true")
case FALSE extends Token(Keyword, "false")

// Alphabetic keywords
case DEF extends Token(Keyword, "def")
case SCHEMA extends Token(Keyword, "schema")
case TYPE extends Token(Keyword, "type")
case WITH extends Token(Keyword, "with")

case IN extends Token(Keyword, "in")

case SELECT extends Token(Keyword, "select")
case FOR extends Token(Keyword, "for")
case LET extends Token(Keyword, "let")
case WHERE extends Token(Keyword, "where")
case GROUP_BY extends Token(Keyword, "group by")
case HAVING extends Token(Keyword, "having")
case ORDER_BY extends Token(Keyword, "order by")
case JOIN extends Token(Keyword, "join")

case RUN extends Token(Keyword, "run")
case IMPORT extends Token(Keyword, "import")
case EXPORT extends Token(Keyword, "export")

case IF extends Token(Keyword, "if")
case THEN extends Token(Keyword, "then")
case ELSE extends Token(Keyword, "else")
case END extends Token(Keyword, "end")

case AND extends Token(Keyword, "and")
case OR extends Token(Keyword, "or")
case NOT extends Token(Keyword, "not")

object Tokens:
import Token.*
val keywords =
Seq(NULL, TRUE, FALSE, DEF, SCHEMA, WITH, FOR, LET, WHERE, GROUP_BY, HAVING, RETURN, ORDER_BY, RUN, EXPORT, IF)
val symbols =
Seq(COLON, COMMA, DOT, DOUBLE_QUOTE, SINGLE_QUOTE, L_PAREN, R_PAREN, L_BRACE, R_BRACE, L_BRACKET, R_BRACKET, EQ, IN)
val allKeywords = keywords ++ symbols
val keywords = Token.values.filter(_.tokenType == Keyword).toSeq
val specialSymbols = Token.values.filter(_.tokenType == Op).toSeq

val allKeywords = keywords ++ specialSymbols

val keywordTable = allKeywords.map(x => x.str -> x).toMap
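The headline change here is the pair of INDENT and OUTDENT control tokens, which make indentation-delimited blocks explicit in the token stream, alongside the new keywordTable that resolves a scanned word to its keyword token with a single map lookup. As a rough illustration of how such tokens are typically produced (this tracker is a sketch, not the scanner in this PR), a scanner can keep a stack of enclosing indentation widths, emitting INDENT when a line is indented more deeply and one OUTDENT per level that closes:

```scala
import scala.collection.mutable
import wvlet.lang.compiler.parser.Token

// Illustrative indentation tracker (not part of this PR): keeps a stack of
// enclosing indentation widths; a wider line opens a block (INDENT), and a
// narrower line emits one OUTDENT per level popped off the stack.
class IndentTracker:
  private val levels = mutable.Stack(0)

  def onNewLine(indentWidth: Int): Seq[Token] =
    if indentWidth > levels.top then
      levels.push(indentWidth)
      Seq(Token.INDENT)
    else
      val emitted = Seq.newBuilder[Token]
      while levels.top > indentWidth do
        levels.pop()
        emitted += Token.OUTDENT
      emitted.result()
```

A scanner would call onNewLine with the leading-whitespace width of each non-blank line and splice the returned tokens into the stream alongside its NEWLINE handling.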
@@ -22,13 +22,13 @@ class TokenBuffer(initialSize: Int = 1024) extends LogSupport:
def append(ch: Char): Unit =
if len == buf.length then
// Double the buffer size
val buf2 = new Array[Char](buf.length * 2)
Array.copy(buf, 0, buf2, 0, len)
buf = buf2
val newBuffer = new Array[Char](buf.length * 2)
Array.copy(buf, 0, newBuffer, 0, len)
buf = newBuffer
buf(len) = ch
len += 1
end append

def chars = buf
def length = len
def isEmpty: Boolean = len == 0
def nonEmpty: Boolean = !isEmpty
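The rename from buf2 to newBuffer keeps the same growth strategy: when the backing array is full, its capacity doubles and the existing characters are copied over, so a long run of append calls stays amortized O(1) per character. A small usage sketch, assuming TokenBuffer lives in the same wvlet.lang.compiler.parser package as the other files in this diff:

```scala
import wvlet.lang.compiler.parser.TokenBuffer

@main def tokenBufferDemo(): Unit =
  // Start with a tiny capacity so the copy-and-double path is exercised.
  val buf = new TokenBuffer(initialSize = 4)
  "select".foreach(buf.append)                    // internal array grows 4 -> 8
  val text = new String(buf.chars, 0, buf.length) // rebuild the scanned text
  assert(text == "select")
```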
@@ -14,9 +14,9 @@
package wvlet.lang.compiler.parser

import wvlet.lang.model.expression.Expression
import wvlet.lang.model.expression.Expression.{ConditionalExpression, Identifier, QName, ReturnItem, StringLiteral}
import wvlet.lang.model.expression.Expression.{ConditionalExpression, Identifier, QName, SelectItem, StringLiteral}
import wvlet.lang.model.logical.LogicalPlan
import wvlet.lang.model.logical.LogicalPlan.{FLOWRQuery, ForItem, Return, SchemaDef, SchemaItem, Where}
import wvlet.lang.model.logical.LogicalPlan.{FLOWRQuery, ForItem, Select, SchemaDef, SchemaItem, Where}
import WvletParser.EOFToken
import wvlet.log.LogSupport

@@ -63,12 +63,12 @@ class WvletParser(tokenScanner: TokenScanner) extends LogSupport:
currentToken.token match
case Token.FOR =>
parseFLOWRQuery
case Token.RETURN =>
// return-only plan
debug(s"return: ${currentToken}")
case Token.SELECT =>
// select-only plan
debug(s"select: ${currentToken}")
val sourceLocation = currentToken.getSourceLocation
val returnClause = parseReturn
FLOWRQuery(forItems = Seq.empty, returnClause = Some(returnClause))(sourceLocation)
val selectClause = parseSelect
FLOWRQuery(forItems = Seq.empty, selectClause = Some(selectClause))(sourceLocation)
case Token.SCHEMA =>
parseSchema
case Token.EOF =>
@@ -87,19 +87,19 @@ class WvletParser(tokenScanner: TokenScanner) extends LogSupport:

// TODO parse group by, join, etc.
var whereClause: Option[Where] = None
var returnClause: Option[Return] = None
var selectClause: Option[Select] = None
next.token match
case Token.WHERE =>
whereClause = Some(parseWhere)
peekNextToken.token match
case Token.RETURN =>
returnClause = Some(parseReturn)
case Token.SELECT =>
selectClause = Some(parseSelect)
case _ =>

case Token.RETURN =>
returnClause = Some(parseReturn)
case Token.SELECT =>
selectClause = Some(parseSelect)
case _ =>
FLOWRQuery(forItems = forItems, whereClause, returnClause)(currentToken.getSourceLocation)
FLOWRQuery(forItems = forItems, whereClause, selectClause)(currentToken.getSourceLocation)
case _ =>
null

@@ -129,28 +129,28 @@ class WvletParser(tokenScanner: TokenScanner) extends LogSupport:
Where(expr)(currentToken.getSourceLocation)
else parseError(currentToken, Token.WHERE)

private def parseReturn: Return =
private def parseSelect: Select =
val currentToken = peekNextToken
if currentToken.token == Token.RETURN then
if currentToken.token == Token.SELECT then
nextToken
val returnItems = parseReturnItems
Return(returnItems)(currentToken.getSourceLocation)
else parseError(currentToken, Token.RETURN)
val selectItems = parseSelectItems
Select(selectItems)(currentToken.getSourceLocation)
else parseError(currentToken, Token.SELECT)

private def parseReturnItems: Seq[ReturnItem] =
val items = Seq.newBuilder[ReturnItem]
private def parseSelectItems: Seq[SelectItem] =
val items = Seq.newBuilder[SelectItem]

@tailrec
def loop: Unit =
val ri = parseReturnItem
val ri = parseSelectItem
items += ri
if peekNextToken.token == Token.COMMA then
nextToken
loop
loop
items.result()

private def parseReturnItem: ReturnItem =
private def parseSelectItem: SelectItem =
val currentToken = peekNextToken
currentToken.token match
case Token.IDENTIFIER =>
@@ -159,12 +159,12 @@ class WvletParser(tokenScanner: TokenScanner) extends LogSupport:
case Token.COLON =>
nextToken
val expr = parseExpression
ReturnItem(Some(qName), expr)
SelectItem(Some(qName), expr)
case _ =>
ReturnItem(None, qName)
SelectItem(None, qName)
case _ =>
val expr = parseExpression
ReturnItem(None, expr)
SelectItem(None, expr)

private def parseSchema: SchemaDef =
val schemaLoc = peekNextToken.getSourceLocation
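Beyond the RETURN-to-SELECT rename, parseSelectItems keeps the usual recursive-descent shape for a comma-separated list: parse one item, and while the lookahead token is a comma, consume it and loop. A generic, self-contained sketch of that shape (the function and its parameters are illustrative, not part of the parser's API):

```scala
import scala.annotation.tailrec

// Comma-separated-list parsing in the same shape as parseSelectItems:
// parse one item, then keep looping while the next token is a comma.
def parseCommaSeparated[A](parseItem: () => A, peek: () => String, advance: () => Unit): Seq[A] =
  val items = Seq.newBuilder[A]
  @tailrec
  def loop(): Unit =
    items += parseItem()
    if peek() == "," then
      advance() // consume the comma
      loop()
  loop()
  items.result()
```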