Skip to content

Commit

Permalink
add marc-matcher
Browse files Browse the repository at this point in the history
  • Loading branch information
bennn committed Oct 27, 2021
1 parent 4977aeb commit 069aa56
Show file tree
Hide file tree
Showing 6 changed files with 242 additions and 0 deletions.
1 change: 1 addition & 0 deletions index.scrbl
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
@include-example{multi-check-true}
@include-example{define-datum-literal-set}
@include-example{rec-contract}
@include-example{marc-matcher}
@include-example{struct-list}
@include-example{syntax-class-contract}
@include-example{except-in-quiet}
Expand Down
40 changes: 40 additions & 0 deletions marc-matcher/marc-matcher-helpers.rkt
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#lang racket

(provide get-subfield-data
simplify-groups
(struct-out marc-subfield))

(struct marc-subfield (subtag data) #:transparent)

(define (parse-data marc-str [sep "$"])
(for/list ([sf (string-split marc-str sep)])
(marc-subfield (substring sf 0 1)
(substring sf 1 (string-length sf)))))

(define (filter-subfields groups subfields)
(define (helper group remaining acc)
(cond [(or (empty? group) (empty? remaining)) (values (reverse acc) remaining)]
[(equal? (first group) (marc-subfield-subtag (first remaining)))
(helper (rest group) (rest remaining) (cons (first remaining) acc))]
[else (helper group (rest remaining) acc)]))
(cond [(or (empty? groups) (empty? subfields)) '()]
[else
(let-values ([(result remaining) (helper (first groups) subfields '())])
(cons result
(filter-subfields (rest groups) remaining)))]))

(define (str->strlist str)
(map string (string->list str)))

(define (simplify-groups groups)
(match groups
[(list (list (marc-subfield t data)))
(marc-subfield t data)]
[_ groups]))

(define (get-subfield-data regexps subfield-str [sep "$"])
(define subfields (parse-data subfield-str sep))
(define subtags (string-join (map marc-subfield-subtag subfields) ""))
(define subfield-groups (map (λ (re) (map str->strlist (regexp-match* re subtags))) regexps))
(for/list ([group subfield-groups])
(filter-subfields group subfields)))
8 changes: 8 additions & 0 deletions marc-matcher/marc-matcher-syntax-classes.rkt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#lang racket
(provide marc-var-defn)

(require syntax/parse)

(define-syntax-class marc-var-defn
#:auto-nested-attributes
(pattern (re:regexp #:as name:id)))
58 changes: 58 additions & 0 deletions marc-matcher/marc-matcher-test.rkt
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#lang racket
(module+ test
(require rackunit syntax-parse-example/marc-matcher/marc-matcher)

(test-case "parse-264"
(define parse-264
(marc-matcher ([#px"ab" #:as place-entity-groups]
[#px"c" #:as date])
(for/list ([group place-entity-groups])
(cons (marc-subfield-data date) (map marc-subfield-data group)))))
(check-equal?
(parse-264 "$aBoston :$bLee and Shepard, publishers ;$aNew York :$bLee, Shepard, and Dillingham,$c1872.")
'(("1872." "Boston :" "Lee and Shepard, publishers ;")
("1872." "New York :" "Lee, Shepard, and Dillingham,"))))

(test-case "table-of-contents"
(define matcher
(marc-matcher ([#px"tr?" #:as title-info-groups])
(for ([group title-info-groups])
(define title (first (map marc-subfield-data
(filter (λ (sf) (equal? "t" (marc-subfield-subtag sf))) group))))
(define authors (map marc-subfield-data
(filter (λ (sf) (equal? "r" (marc-subfield-subtag sf))) group)))
(printf "Title: ~a~a~n~n"
(string-trim title #px"( /\\s*)|( --\\s*)|\\.")
(if (empty? authors)
""
(string-append "\nAuthor: "
(string-trim (first authors) #px"( /\\s*)|( --\\s*)|\\.")))))))
(define data
'("$tCaveat Lector; or how I ransacked Wikipedias across the Multiverse soley "
"to amuse and edify readers -- $tMystery of the missing mothers / $rKristin King -- "
"$tSecrets of Flatland / $rAnne Toole -- $tSanyo TM-300 Home-Use Time Machine / "
"$rJeremy Sim -- $tElizabeth Burgoyne Corbett / $rL. Timmel Duchamp -- "
"$tBiographies."))
(check-equal?
(with-output-to-string
(lambda () (matcher (string-join data ""))))
(string-join
'("Title: Caveat Lector; or how I ransacked Wikipedias across the Multiverse soley to amuse and edify readers"
""
"Title: Mystery of the missing mothers"
"Author: Kristin King"
""
"Title: Secrets of Flatland"
"Author: Anne Toole"
""
"Title: Sanyo TM-300 Home-Use Time Machine"
"Author: Jeremy Sim"
""
"Title: Elizabeth Burgoyne Corbett"
"Author: L. Timmel Duchamp"
""
"Title: Biographies"
""
"")
"\n")))
)
15 changes: 15 additions & 0 deletions marc-matcher/marc-matcher.rkt
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#lang racket
(provide marc-matcher (struct-out marc-subfield))

(require syntax/parse/define
syntax-parse-example/marc-matcher/marc-matcher-helpers
(for-syntax syntax-parse-example/marc-matcher/marc-matcher-syntax-classes))

(define-syntax (marc-matcher stx)
(syntax-parse stx
[(_ (var:marc-var-defn ...) body:expr ...)
(define params #'(var.name ...))
(define regexps #'(var.re ...))
#`(λ (input [sep "$"])
(define args (get-subfield-data '#,regexps input sep))
(apply (λ #,params (begin body ...)) (map simplify-groups args)))]))
120 changes: 120 additions & 0 deletions marc-matcher/marc-matcher.scrbl
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
#lang syntax-parse-example
@require[
(for-label racket/base racket/contract/base syntax/parse syntax-parse-example/marc-matcher/marc-matcher)]

@(define marc-matcher-eval
(make-base-eval '(require racket/string racket/list syntax-parse-example/marc-matcher/marc-matcher)))

@(define (subitem . elem*) (itemlist (apply item elem*)))

@title{@tt{marc-matcher}}
@stxbee2021["hzafar" 4]

@; =============================================================================

@defmodule[syntax-parse-example/marc-matcher/marc-matcher]{}

This is a very domain-specific macro, developed for a particular bibliographic
metadata use-case.

@defform[(marc-matcher ([re #:as name] ...) body ...)]{
This macro aims to make it easier to do regex-like matching over a structured
bibliographic data format known as
@hyperlink["https://www.loc.gov/marc/bibliographic/" "MARC 21"].
MARC records contain a sequence of fields whose data are string values that
look like this:

@nested[#:style 'code-inset
@tt{$aCarroll, Lewis,$d1832-1898,$eauthor.}]

In each field, individual subfields are separated using a separator character
(in this case @tt{$}); the character immediately following the separator is
called the subtag; and the substring up to the next separator or end-of-string
is the subfield data. So in the example above, there are three subfields,
@tt{$a}, @tt{$d}, and @tt{$e}, whose data are, respectively, @tt{Carroll, Lewis},
@tt{1832-1898}, and @tt{author}.

Parsing subfields out of this is often done using regular expressions, but it
gets really difficult when trying to deal with subfield repetitions. I'll use
@hyperlink["https://www.loc.gov/marc/bibliographic/bd264.html" "field 264"]
to illustrate.
This field mainly contains the following pieces of publication information:
the @tt{$a} subfield contains place of publication; the @tt{$b} subfield
contains the entity responsible for publication; and the @tt{$c} subfield
contains the date of publication.
There are several possible repetition patterns for these subfields which
require different semantic interpretations. To give a few examples:

@itemlist[
@item{
@tt{a+bc}: multiple places of publication with the same publisher
@subitem{@tt{$aLondon ;$aNew York :$bRoutledge,$c2017.} [@hyperlink["https://catalog.loc.gov/vwebv/staffView?bibId=19255280" "source"]]}
}
@item{
@tt{ab+c}: multiple publishers with the same place of publication
@subitem{@tt{$aNew York, NY :$bBarnes & Noble :$bSterling Publishing Co., Inc.,$c2012.} [@hyperlink["https://catalog.loc.gov/vwebv/staffView?bibId=17618487" "source"]]}
}
@item{
@tt{(ab)+c}: multiple publications, each with different places and publishers
@subitem{@tt{$aBoston :$bLee and Shepard, publishers ;$aNew York :$bLee, Shepard, and Dillingham,$c1872.} [@hyperlink["https://catalog.loc.gov/vwebv/staffView?bibId=19048248" "source"]]}
}]

Writing a regex to intelligently parse this information out of the string is
a pain, but regexes are an already popular and well understood tool in the
metadata community. Thus, @racket[marc-matcher] lets users specify regular
expressions that match subgroups within the field they want to parse, and
define variables they can use in their code containing the results of those
matches, which allows more complex kinds of processing to be done with
simpler code.

@examples[#:eval marc-matcher-eval
(define parse-264
(marc-matcher ([#px"ab" #:as place-entity-groups]
[#px"c" #:as date])
(for/list ([group place-entity-groups])
(cons (marc-subfield-data date) (map marc-subfield-data group)))))
(parse-264 "$aBoston :$bLee and Shepard, publishers ;$aNew York :$bLee, Shepard, and Dillingham,$c1872.")
]

The first clause of the @racket[marc-matcher] expression is a list of variable
definitions, similar to a parameter list for a lambda.
For example, the clause @racket[[#px"ab" :as place-entity-groups]] defines a
variable called @racket[place-entity-groups], which will be a list of all the
groups (which are themselves lists of structs) consisting of a single
subfield @racket[$a] followed by a single subfield @racket[$b].
The second clause is the computation the user wishes to do with the values
extracted from the field, and can refer to the variables defined in the first
clause.

Here is another example, using @hyperlink["https://www.loc.gov/marc/bibliographic/bd505.html" "table of contents"]
data [@hyperlink["https://catalog.loc.gov/vwebv/staffView?bibId=17682122" "source"]].

@examples[#:eval marc-matcher-eval
(define matcher
(marc-matcher ([#px"tr?" #:as title-info-groups])
(for ([group title-info-groups])
(define title (first (map marc-subfield-data
(filter (λ (sf) (equal? "t" (marc-subfield-subtag sf))) group))))
(define authors (map marc-subfield-data
(filter (λ (sf) (equal? "r" (marc-subfield-subtag sf))) group)))
(printf "Title: ~a~a~n~n"
(string-trim title #px"( /\\s*)|( --\\s*)|\\.")
(if (empty? authors)
""
(string-append "\nAuthor: "
(string-trim (first authors) #px"( /\\s*)|( --\\s*)|\\.")))))))
(matcher
(string-join '("$tCaveat Lector; or how I ransacked Wikipedias across the Multiverse soley "
"to amuse and edify readers -- $tMystery of the missing mothers / $rKristin King -- "
"$tSecrets of Flatland / $rAnne Toole -- $tSanyo TM-300 Home-Use Time Machine / "
"$rJeremy Sim -- $tElizabeth Burgoyne Corbett / $rL. Timmel Duchamp -- "
"$tBiographies.")
""))]

The macro definition parses the clauses for parameters and regexps, and then generates calls to run-time helper functions.

@racketfile{marc-matcher.rkt}
}

@defstruct[marc-subfield ([subtag any/c] [data any/c]) #:transparent]

0 comments on commit 069aa56

Please sign in to comment.