From 37ddf2ea321570d80a192e5d4a76ddb24417ce13 Mon Sep 17 00:00:00 2001
From: Andy Maloney
Date: Sat, 20 Jan 2024 12:10:30 -0500
Subject: [PATCH] Reduce scope of keywords during lexing

This means keywords in one section (e.g. "name") may be used as ids in
other sections.
---
 amod/amod_config_test.go |  13 +++++
 amod/lex.go              | 116 +++++++++++++++++++++++++++++----------
 amod/lex_test.go         |   9 +++
 amod/parse.go            |  38 ++++++-------
 4 files changed, 129 insertions(+), 47 deletions(-)

diff --git a/amod/amod_config_test.go b/amod/amod_config_test.go
index 5b02e601..0ebde72e 100644
--- a/amod/amod_config_test.go
+++ b/amod/amod_config_test.go
@@ -457,3 +457,16 @@ func Example_proceduralFieldUnrecognized() {
 	// Output:
 	// ERROR: unrecognized option "foo" in procedural config (line 6, col 15)
 }
+
+// Tests that we can use a keyword from one section as an id in another
+func Example_keywordInDifferentSection() {
+	generateToStdout(`
+	~~ model ~~
+	name: Test
+	~~ config ~~
+	chunks { [name: first last] }
+	~~ init ~~
+	~~ productions ~~`)
+
+	// Output:
+}
diff --git a/amod/lex.go b/amod/lex.go
index 09d3fa00..d20fc70b 100644
--- a/amod/lex.go
+++ b/amod/lex.go
@@ -8,6 +8,7 @@ package amod
 import (
 	"fmt"
 	"io"
+	"slices"
 	"strings"
 	"unicode"
 	"unicode/utf8"
@@ -99,18 +100,31 @@ type lexeme struct {
 	pos  int // position within the line
 }
 
+// sectionType is used to keep track of which section we are lexing.
+// We use this to limit the scope of keywords.
+type sectionType int
+
+const (
+	sectionModel sectionType = iota
+	sectionConfig
+	sectionInit
+	sectionProduction
+)
+
 // lexer_amod tracks our lexing and provides a channel to emit lexemes
 type lexer_amod struct {
 	name           string // used only for error reports
 	input          string // the string being scanned.
 	line           int    // the line number
 	lastNewlinePos int
-	start          int             // start position of this lexeme (offset from beginning of file)
-	pos            int             // current position in the input (offset from beginning of file)
-	width          int             // width of last rune read from input
-	lexemes        chan lexeme     // channel of scanned lexemes
-	keywords       map[string]bool // used to lookup identifier to see if they are keywords
-	inPattern      bool            // state: a pattern - delimited by [] is lexed specially
+	start   int         // start position of this lexeme (offset from beginning of file)
+	pos     int         // current position in the input (offset from beginning of file)
+	width   int         // width of last rune read from input
+	lexemes chan lexeme // channel of scanned lexemes
+
+	inSectionHeader bool        // state: switch currentSection based on ~~ section headers
+	currentSection  sectionType // which section are we lexing? used to switch out keywords
+	inPattern       bool        // state: a pattern - delimited by [] is lexed specially
 }
 
 // stateFn is used to move through the lexing states
@@ -122,26 +136,40 @@ const (
 	commentDelim = "//"
 )
 
-var keywords []string = []string{
+// keywordsModel are only keywords for the model section
+var keywordsModel []string = []string{
+	"authors",
+	"description",
+	"examples",
+	"name",
+}
+
+// keywordsConfig are only keywords for the config section
+var keywordsConfig []string = []string{
+	"chunks",
+	"gactar",
+	"modules",
+}
+
+// keywordsInit are only keywords for the init section
+var keywordsInit []string = []string{
+	"similar",
+}
+
+// keywordsProductions are only keywords for the productions section
+var keywordsProductions []string = []string{
 	"and",
 	"any",
-	"authors",
 	"buffer_state",
-	"chunks",
 	"clear",
 	"description",
 	"do",
-	"examples",
-	"gactar",
 	"match",
 	"module_state",
-	"modules",
-	"name",
 	"nil",
 	"print",
 	"recall",
 	"set",
-	"similar",
 	"stop",
 	"to",
 	"when",
@@ -185,17 +213,14 @@ func lex(filename string, data string) *lexer_amod {
 	cleanData(&data)
 
 	l := &lexer_amod{
-		name:           filename,
-		input:          data,
-		line:           1,
-		lastNewlinePos: 1, // start @ 1 so first line gets 0 (see emit())
-		lexemes:        make(chan lexeme),
-		keywords:       make(map[string]bool),
-		inPattern:      false,
-	}
-
-	for _, v := range keywords {
-		l.keywords[v] = true
+		name:            filename,
+		input:           data,
+		line:            1,
+		lastNewlinePos:  1, // start @ 1 so first line gets 0 (see emit())
+		lexemes:         make(chan lexeme),
+		currentSection:  sectionModel,
+		inSectionHeader: false,
+		inPattern:       false,
 	}
 
 	go l.run()
@@ -252,9 +277,20 @@ func (l *lexer_amod) next() rune {
 	return r
 }
 
+// lookupKeyword checks whether "id" is a keyword, based on which section we are lexing
 func (l *lexer_amod) lookupKeyword(id string) bool {
-	v, ok := l.keywords[id]
-	return v && ok
+	switch l.currentSection {
+	case sectionModel:
+		return slices.Contains(keywordsModel, id)
+	case sectionConfig:
+		return slices.Contains(keywordsConfig, id)
+	case sectionInit:
+		return slices.Contains(keywordsInit, id)
+	case sectionProduction:
+		return slices.Contains(keywordsProductions, id)
+	}
+
+	return false
 }
 
 // skip over the pending input before this point
@@ -428,6 +464,7 @@ func lexStart(l *lexer_amod) stateFn {
 		if l.nextIs('~') {
 			l.next()
 			l.emit(lexemeSectionDelim)
+			l.inSectionHeader = !l.inSectionHeader
 		} else {
 			l.emit(lexemeChar)
 		}
@@ -495,9 +532,32 @@ func lexIdentifier(l *lexer_amod) stateFn {
 		l.next()
 	}
 
+	id := l.input[l.start:l.pos]
+	isKeyword := false
+
+	// If we are in a section header, then change our current section
+	if l.inSectionHeader {
+		switch id {
+		case "model":
+			l.currentSection = sectionModel
+		case "config":
+			l.currentSection = sectionConfig
+		case "init":
+			l.currentSection = sectionInit
+		case "productions":
+			l.currentSection = sectionProduction
+		default:
+			return l.errorf("unrecognized section")
+		}
+
+		// these are keywords in this context
+		isKeyword = true
+	} else {
+		isKeyword = l.lookupKeyword(id)
+	}
+
 	// Perhaps not the best way to do this.
 	// I'm sure there's a char-by-char way we could implement which would be faster.
-	isKeyword := l.lookupKeyword(l.input[l.start:l.pos])
 	switch {
 	case isKeyword:
 		l.emit(lexemeKeyword)
diff --git a/amod/lex_test.go b/amod/lex_test.go
index cf045ae9..b409911b 100644
--- a/amod/lex_test.go
+++ b/amod/lex_test.go
@@ -45,6 +45,15 @@ func TestInvalidSection(t *testing.T) {
 	if token.Type != lexer.TokenType(lexemeSectionDelim) {
 		t.Errorf("expected to lex '%s' as section delimiter (%d) - got type %d", token.Value, lexemeSectionDelim, token.Type)
 	}
+
+	expected := "ERROR on line 1 at position 9: unrecognized section"
+
+	token, err = l.Next()
+	if err == nil {
+		t.Errorf("expected error: %q", expected)
+	} else if err.Error() != expected {
+		t.Errorf("expected error: %q but got %q", expected, err.Error())
+	}
 }
 
 func TestUnterminatedQuote(t *testing.T) {
diff --git a/amod/parse.go b/amod/parse.go
index 721b2080..b962af63 100644
--- a/amod/parse.go
+++ b/amod/parse.go
@@ -25,26 +25,26 @@ import (
 // paste in the generated EBNF above, click "Convert" and then click "View Diagram"
 
 type amodFile struct {
-	ModelHeader string        `parser:"'~~':SectionDelim 'model' '~~':SectionDelim"`
+	ModelHeader string        `parser:"'~~':SectionDelim 'model':Keyword '~~':SectionDelim"`
 	Model       *modelSection `parser:"@@"`
 
-	ConfigHeader string         `parser:"'~~':SectionDelim 'config' '~~':SectionDelim"`
+	ConfigHeader string         `parser:"'~~':SectionDelim 'config':Keyword '~~':SectionDelim"`
 	Config       *configSection `parser:"(@@)?"`
 
-	InitHeader string       `parser:"'~~':SectionDelim 'init' '~~':SectionDelim"`
+	InitHeader string       `parser:"'~~':SectionDelim 'init':Keyword '~~':SectionDelim"`
 	Init       *initSection `parser:"(@@)?"`
 
-	ProductionsHeader string             `parser:"'~~':SectionDelim 'productions' '~~':SectionDelim"`
+	ProductionsHeader string             `parser:"'~~':SectionDelim 'productions':Keyword '~~':SectionDelim"`
 	Productions       *productionSection `parser:"(@@)?"`
 
 	Tokens []lexer.Token
 }
 
 type modelSection struct {
-	Name        string     `parser:"'name' ':' (@String|@Ident)"`
-	Description string     `parser:"('description' ':' @String)?"`
-	Authors     []string   `parser:"('authors' '{' @String* '}')?"`
-	Examples    []*pattern `parser:"('examples' '{' @@* '}')?"`
+	Name        string     `parser:"'name':Keyword ':' (@String|@Ident)"`
+	Description string     `parser:"('description':Keyword ':' @String)?"`
+	Authors     []string   `parser:"('authors':Keyword '{' @String* '}')?"`
+	Examples    []*pattern `parser:"('examples':Keyword '{' @@* '}')?"`
 
 	Tokens []lexer.Token
 }
@@ -165,7 +165,7 @@ type field struct {
 }
 
 type gactarConfig struct {
-	GactarFields []*field `parser:"'gactar' '{' @@* '}'"`
+	GactarFields []*field `parser:"'gactar':Keyword '{' @@* '}'"`
 
 	Tokens []lexer.Token
 }
@@ -178,7 +178,7 @@ type module struct {
 }
 
 type moduleConfig struct {
-	Modules []*module `parser:"'modules' '{' @@* '}'"`
+	Modules []*module `parser:"'modules':Keyword '{' @@* '}'"`
 
 	Tokens []lexer.Token
 }
@@ -193,7 +193,7 @@ type chunkDecl struct {
 }
 
 type chunkConfig struct {
-	ChunkDecls []*chunkDecl `parser:"'chunks' '{' @@* '}'"`
+	ChunkDecls []*chunkDecl `parser:"'chunks':Keyword '{' @@* '}'"`
 
 	Tokens []lexer.Token
 }
@@ -347,19 +347,19 @@ type matchItem struct {
 }
 
 type match struct {
-	Items []*matchItem `parser:"'match' '{' @@+ '}'"`
+	Items []*matchItem `parser:"'match':Keyword '{' @@+ '}'"`
 
 	Tokens []lexer.Token
 }
 
 type clearStatement struct {
-	BufferNames []string `parser:"'clear' ( @Ident ','? )+"`
+	BufferNames []string `parser:"'clear':Keyword ( @Ident ','? )+"`
 
 	Tokens []lexer.Token
 }
 
 type printStatement struct {
-	Args []*printArg `parser:"'print' ( @@ ','? )*"`
+	Args []*printArg `parser:"'print':Keyword ( @@ ','? )*"`
 
 	Tokens []lexer.Token
 }
@@ -381,17 +381,17 @@ type withClause struct {
 }
 
 type recallStatement struct {
-	Pattern *pattern    `parser:"'recall' @@"`
+	Pattern *pattern    `parser:"'recall':Keyword @@"`
 	With    *withClause `parser:"@@?"`
 
 	Tokens []lexer.Token
 }
 
 type setStatement struct {
-	Set       string    `parser:"'set'"` // not used, but must be visible for parse to work
+	Set       string    `parser:"'set':Keyword"` // not used, but must be visible for parse to work
 	BufferRef bufferRef `parser:"@@"`
 
-	To      string   `parser:"'to'"` // not used, but must be visible for parse to work
+	To      string   `parser:"'to':Keyword"` // not used, but must be visible for parse to work
 	Value   *setArg  `parser:"( @@"`
 	Pattern *pattern `parser:"| @@)"`
 
@@ -415,7 +415,7 @@ type statement struct {
 }
 
 type do struct {
-	Do         string        `parser:"'do'"` // not used, but must be visible for parse to work
+	Do         string        `parser:"'do':Keyword"` // not used, but must be visible for parse to work
 	Statements *[]*statement `parser:"'{' @@+ '}'"`
 
 	Tokens []lexer.Token
@@ -423,7 +423,7 @@ type do struct {
 
 type production struct {
 	Name        string  `parser:"@Ident '{'"`
-	Description *string `parser:"('description' ':' @String)?"`
+	Description *string `parser:"('description':Keyword ':' @String)?"`
 	Match       *match  `parser:"@@"`
 	Do          *do     `parser:"@@"`
 	End         string  `parser:"'}'"` // not used, but must be visible for parse to work
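
Reviewer note: a minimal, standalone sketch (not part of the patch) of the
section-scoped keyword lookup that amod/lex.go now performs. The
keywordsBySection map below is a hypothetical consolidation of the patch's
separate keywordsModel / keywordsConfig / keywordsInit / keywordsProductions
slices; the patch itself switches on currentSection in lookupKeyword.
Requires Go 1.21+ for the standard-library "slices" package.

package main

import (
	"fmt"
	"slices"
)

// sectionType mirrors the type added in amod/lex.go.
type sectionType int

const (
	sectionModel sectionType = iota
	sectionConfig
	sectionInit
	sectionProduction
)

// keywordsBySection groups keywords by the section they belong to
// (lists abbreviated here; see the patch for the full ones).
var keywordsBySection = map[sectionType][]string{
	sectionModel:      {"authors", "description", "examples", "name"},
	sectionConfig:     {"chunks", "gactar", "modules"},
	sectionInit:       {"similar"},
	sectionProduction: {"and", "do", "match", "print", "set", "to"},
}

// isKeyword reports whether id is a keyword while lexing the given section.
func isKeyword(section sectionType, id string) bool {
	return slices.Contains(keywordsBySection[section], id)
}

func main() {
	fmt.Println(isKeyword(sectionModel, "name"))  // true: keyword inside ~~ model ~~
	fmt.Println(isKeyword(sectionConfig, "name")) // false: plain id inside ~~ config ~~
}

This per-section lookup is what lets the new Example_keywordInDifferentSection
test use "name" (a model-section keyword) as a chunk slot id in the config
section.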