From 069aa56eecfea3c8db08574a31cb51bbf2c557a0 Mon Sep 17 00:00:00 2001 From: Ben Greenman Date: Tue, 28 Sep 2021 13:30:28 -0400 Subject: [PATCH] add marc-matcher from https://github.com/syntax-objects/Summer2021/issues/4 cc @hzafar --- index.scrbl | 1 + marc-matcher/marc-matcher-helpers.rkt | 40 +++++++ marc-matcher/marc-matcher-syntax-classes.rkt | 8 ++ marc-matcher/marc-matcher-test.rkt | 58 +++++++++ marc-matcher/marc-matcher.rkt | 15 +++ marc-matcher/marc-matcher.scrbl | 120 +++++++++++++++++++ 6 files changed, 242 insertions(+) create mode 100644 marc-matcher/marc-matcher-helpers.rkt create mode 100644 marc-matcher/marc-matcher-syntax-classes.rkt create mode 100644 marc-matcher/marc-matcher-test.rkt create mode 100644 marc-matcher/marc-matcher.rkt create mode 100644 marc-matcher/marc-matcher.scrbl diff --git a/index.scrbl b/index.scrbl index c84e575..9b3b312 100644 --- a/index.scrbl +++ b/index.scrbl @@ -31,6 +31,7 @@ @include-example{multi-check-true} @include-example{define-datum-literal-set} @include-example{rec-contract} +@include-example{marc-matcher} @include-example{struct-list} @include-example{syntax-class-contract} @include-example{except-in-quiet} diff --git a/marc-matcher/marc-matcher-helpers.rkt b/marc-matcher/marc-matcher-helpers.rkt new file mode 100644 index 0000000..86bd055 --- /dev/null +++ b/marc-matcher/marc-matcher-helpers.rkt @@ -0,0 +1,40 @@ +#lang racket + +(provide get-subfield-data + simplify-groups + (struct-out marc-subfield)) + +(struct marc-subfield (subtag data) #:transparent) + +(define (parse-data marc-str [sep "$"]) + (for/list ([sf (string-split marc-str sep)]) + (marc-subfield (substring sf 0 1) + (substring sf 1 (string-length sf))))) + +(define (filter-subfields groups subfields) + (define (helper group remaining acc) + (cond [(or (empty? group) (empty? remaining)) (values (reverse acc) remaining)] + [(equal? (first group) (marc-subfield-subtag (first remaining))) + (helper (rest group) (rest remaining) (cons (first remaining) acc))] + [else (helper group (rest remaining) acc)])) + (cond [(or (empty? groups) (empty? subfields)) '()] + [else + (let-values ([(result remaining) (helper (first groups) subfields '())]) + (cons result + (filter-subfields (rest groups) remaining)))])) + +(define (str->strlist str) + (map string (string->list str))) + +(define (simplify-groups groups) + (match groups + [(list (list (marc-subfield t data))) + (marc-subfield t data)] + [_ groups])) + +(define (get-subfield-data regexps subfield-str [sep "$"]) + (define subfields (parse-data subfield-str sep)) + (define subtags (string-join (map marc-subfield-subtag subfields) "")) + (define subfield-groups (map (λ (re) (map str->strlist (regexp-match* re subtags))) regexps)) + (for/list ([group subfield-groups]) + (filter-subfields group subfields))) diff --git a/marc-matcher/marc-matcher-syntax-classes.rkt b/marc-matcher/marc-matcher-syntax-classes.rkt new file mode 100644 index 0000000..b09cf78 --- /dev/null +++ b/marc-matcher/marc-matcher-syntax-classes.rkt @@ -0,0 +1,8 @@ +#lang racket +(provide marc-var-defn) + +(require syntax/parse) + +(define-syntax-class marc-var-defn + #:auto-nested-attributes + (pattern (re:regexp #:as name:id))) diff --git a/marc-matcher/marc-matcher-test.rkt b/marc-matcher/marc-matcher-test.rkt new file mode 100644 index 0000000..2b744a2 --- /dev/null +++ b/marc-matcher/marc-matcher-test.rkt @@ -0,0 +1,58 @@ +#lang racket +(module+ test + (require rackunit syntax-parse-example/marc-matcher/marc-matcher) + + (test-case "parse-264" + (define parse-264 + (marc-matcher ([#px"ab" #:as place-entity-groups] + [#px"c" #:as date]) + (for/list ([group place-entity-groups]) + (cons (marc-subfield-data date) (map marc-subfield-data group))))) + (check-equal? + (parse-264 "$aBoston :$bLee and Shepard, publishers ;$aNew York :$bLee, Shepard, and Dillingham,$c1872.") + '(("1872." "Boston :" "Lee and Shepard, publishers ;") + ("1872." "New York :" "Lee, Shepard, and Dillingham,")))) + + (test-case "table-of-contents" + (define matcher + (marc-matcher ([#px"tr?" #:as title-info-groups]) + (for ([group title-info-groups]) + (define title (first (map marc-subfield-data + (filter (λ (sf) (equal? "t" (marc-subfield-subtag sf))) group)))) + (define authors (map marc-subfield-data + (filter (λ (sf) (equal? "r" (marc-subfield-subtag sf))) group))) + (printf "Title: ~a~a~n~n" + (string-trim title #px"( /\\s*)|( --\\s*)|\\.") + (if (empty? authors) + "" + (string-append "\nAuthor: " + (string-trim (first authors) #px"( /\\s*)|( --\\s*)|\\."))))))) + (define data + '("$tCaveat Lector; or how I ransacked Wikipedias across the Multiverse soley " + "to amuse and edify readers -- $tMystery of the missing mothers / $rKristin King -- " + "$tSecrets of Flatland / $rAnne Toole -- $tSanyo TM-300 Home-Use Time Machine / " + "$rJeremy Sim -- $tElizabeth Burgoyne Corbett / $rL. Timmel Duchamp -- " + "$tBiographies.")) + (check-equal? + (with-output-to-string + (lambda () (matcher (string-join data "")))) + (string-join + '("Title: Caveat Lector; or how I ransacked Wikipedias across the Multiverse soley to amuse and edify readers" + "" + "Title: Mystery of the missing mothers" + "Author: Kristin King" + "" + "Title: Secrets of Flatland" + "Author: Anne Toole" + "" + "Title: Sanyo TM-300 Home-Use Time Machine" + "Author: Jeremy Sim" + "" + "Title: Elizabeth Burgoyne Corbett" + "Author: L. Timmel Duchamp" + "" + "Title: Biographies" + "" + "") + "\n"))) +) diff --git a/marc-matcher/marc-matcher.rkt b/marc-matcher/marc-matcher.rkt new file mode 100644 index 0000000..a298ba0 --- /dev/null +++ b/marc-matcher/marc-matcher.rkt @@ -0,0 +1,15 @@ +#lang racket +(provide marc-matcher (struct-out marc-subfield)) + +(require syntax/parse/define + syntax-parse-example/marc-matcher/marc-matcher-helpers + (for-syntax syntax-parse-example/marc-matcher/marc-matcher-syntax-classes)) + +(define-syntax (marc-matcher stx) + (syntax-parse stx + [(_ (var:marc-var-defn ...) body:expr ...) + (define params #'(var.name ...)) + (define regexps #'(var.re ...)) + #`(λ (input [sep "$"]) + (define args (get-subfield-data '#,regexps input sep)) + (apply (λ #,params (begin body ...)) (map simplify-groups args)))])) diff --git a/marc-matcher/marc-matcher.scrbl b/marc-matcher/marc-matcher.scrbl new file mode 100644 index 0000000..29e3ac7 --- /dev/null +++ b/marc-matcher/marc-matcher.scrbl @@ -0,0 +1,120 @@ +#lang syntax-parse-example +@require[ + (for-label racket/base racket/contract/base syntax/parse syntax-parse-example/marc-matcher/marc-matcher)] + +@(define marc-matcher-eval + (make-base-eval '(require racket/string racket/list syntax-parse-example/marc-matcher/marc-matcher))) + +@(define (subitem . elem*) (itemlist (apply item elem*))) + +@title{@tt{marc-matcher}} +@stxbee2021["hzafar" 4] + +@; ============================================================================= + +@defmodule[syntax-parse-example/marc-matcher/marc-matcher]{} + +This is a very domain-specific macro, developed for a particular bibliographic +metadata use-case. + +@defform[(marc-matcher ([re #:as name] ...) body ...)]{ + This macro aims to make it easier to do regex-like matching over a structured + bibliographic data format known as + @hyperlink["https://www.loc.gov/marc/bibliographic/" "MARC 21"]. + MARC records contain a sequence of fields whose data are string values that + look like this: + + @nested[#:style 'code-inset + @tt{$aCarroll, Lewis,$d1832-1898,$eauthor.}] + + In each field, individual subfields are separated using a separator character + (in this case @tt{$}); the character immediately following the separator is + called the subtag; and the substring up to the next separator or end-of-string + is the subfield data. So in the example above, there are three subfields, + @tt{$a}, @tt{$d}, and @tt{$e}, whose data are, respectively, @tt{Carroll, Lewis}, + @tt{1832-1898}, and @tt{author}. + + Parsing subfields out of this is often done using regular expressions, but it + gets really difficult when trying to deal with subfield repetitions. I'll use + @hyperlink["https://www.loc.gov/marc/bibliographic/bd264.html" "field 264"] + to illustrate. + This field mainly contains the following pieces of publication information: + the @tt{$a} subfield contains place of publication; the @tt{$b} subfield + contains the entity responsible for publication; and the @tt{$c} subfield + contains the date of publication. + There are several possible repetition patterns for these subfields which + require different semantic interpretations. To give a few examples: + + @itemlist[ + @item{ + @tt{a+bc}: multiple places of publication with the same publisher + @subitem{@tt{$aLondon ;$aNew York :$bRoutledge,$c2017.} [@hyperlink["https://catalog.loc.gov/vwebv/staffView?bibId=19255280" "source"]]} + } + @item{ + @tt{ab+c}: multiple publishers with the same place of publication + @subitem{@tt{$aNew York, NY :$bBarnes & Noble :$bSterling Publishing Co., Inc.,$c2012.} [@hyperlink["https://catalog.loc.gov/vwebv/staffView?bibId=17618487" "source"]]} + } + @item{ + @tt{(ab)+c}: multiple publications, each with different places and publishers + @subitem{@tt{$aBoston :$bLee and Shepard, publishers ;$aNew York :$bLee, Shepard, and Dillingham,$c1872.} [@hyperlink["https://catalog.loc.gov/vwebv/staffView?bibId=19048248" "source"]]} + }] + + Writing a regex to intelligently parse this information out of the string is + a pain, but regexes are an already popular and well understood tool in the + metadata community. Thus, @racket[marc-matcher] lets users specify regular + expressions that match subgroups within the field they want to parse, and + define variables they can use in their code containing the results of those + matches, which allows more complex kinds of processing to be done with + simpler code. + + @examples[#:eval marc-matcher-eval + (define parse-264 + (marc-matcher ([#px"ab" #:as place-entity-groups] + [#px"c" #:as date]) + (for/list ([group place-entity-groups]) + (cons (marc-subfield-data date) (map marc-subfield-data group))))) + (parse-264 "$aBoston :$bLee and Shepard, publishers ;$aNew York :$bLee, Shepard, and Dillingham,$c1872.") + ] + + The first clause of the @racket[marc-matcher] expression is a list of variable + definitions, similar to a parameter list for a lambda. + For example, the clause @racket[[#px"ab" :as place-entity-groups]] defines a + variable called @racket[place-entity-groups], which will be a list of all the + groups (which are themselves lists of structs) consisting of a single + subfield @racket[$a] followed by a single subfield @racket[$b]. + The second clause is the computation the user wishes to do with the values + extracted from the field, and can refer to the variables defined in the first + clause. + + Here is another example, using @hyperlink["https://www.loc.gov/marc/bibliographic/bd505.html" "table of contents"] + data [@hyperlink["https://catalog.loc.gov/vwebv/staffView?bibId=17682122" "source"]]. + + @examples[#:eval marc-matcher-eval + (define matcher + (marc-matcher ([#px"tr?" #:as title-info-groups]) + (for ([group title-info-groups]) + (define title (first (map marc-subfield-data + (filter (λ (sf) (equal? "t" (marc-subfield-subtag sf))) group)))) + (define authors (map marc-subfield-data + (filter (λ (sf) (equal? "r" (marc-subfield-subtag sf))) group))) + (printf "Title: ~a~a~n~n" + (string-trim title #px"( /\\s*)|( --\\s*)|\\.") + (if (empty? authors) + "" + (string-append "\nAuthor: " + (string-trim (first authors) #px"( /\\s*)|( --\\s*)|\\."))))))) + (matcher + (string-join '("$tCaveat Lector; or how I ransacked Wikipedias across the Multiverse soley " + "to amuse and edify readers -- $tMystery of the missing mothers / $rKristin King -- " + "$tSecrets of Flatland / $rAnne Toole -- $tSanyo TM-300 Home-Use Time Machine / " + "$rJeremy Sim -- $tElizabeth Burgoyne Corbett / $rL. Timmel Duchamp -- " + "$tBiographies.") + ""))] + + The macro definition parses the clauses for parameters and regexps, and then generates calls to run-time helper functions. + + @racketfile{marc-matcher.rkt} +} + +@defstruct[marc-subfield ([subtag any/c] [data any/c]) #:transparent] +