diff --git a/format.go b/format.go
index a4aa318..c5ddd3e 100644
--- a/format.go
+++ b/format.go
@@ -3,60 +3,65 @@ package main
 import (
 	"bytes"
 	"errors"
+	"go/ast"
+	"go/format"
+	"go/parser"
+	"go/token"
 	"strings"
 	"unicode"
 	"unicode/utf8"
-)
-
-// macroname is the name of the macro that applies math formatting.
-const macroname = "\\mathfmt"
-
-// Format processes the source code in b.
-func Format(b []byte) ([]byte, error) {
-	var buf bytes.Buffer
-	s := string(b)
-	for len(s) > 0 {
-		// Look for the next macro.
-		i := strings.Index(s, macroname)
-
-		// Exit if not found.
-		if i < 0 {
-			buf.WriteString(s)
-			break
-		}
 
-		// Write out up to the macro.
-		buf.WriteString(s[:i])
-		s = s[i:]
-
-		// Process the macro.
-		rest, err := macro(&buf, s[len(macroname):])
-		if err != nil {
-			return nil, err
-		}
-		s = rest
-	}
-
-	return buf.Bytes(), nil
-}
+	"golang.org/x/tools/go/ast/astutil"
+)
 
-// macro processes a macro starting at s. Note s begins at the character directly after the macro name.
-func macro(w *bytes.Buffer, s string) (string, error) {
-	if len(s) == 0 {
-		return "", errors.New("empty macro")
+// Format parses src as Go source, applies formula to every comment and gofmts the result.
+func Format(src []byte) ([]byte, error) {
+	// Parse, retaining comments so they can be rewritten.
+	fset := token.NewFileSet()
+	f, err := parser.ParseFile(fset, "", src, parser.ParseComments)
+	if err != nil {
+		return nil, err
 	}
 
-	arg, rest, err := parsebraces(s)
+	// Apply transform; err is nil here and latches the first formula failure.
+	transformed := CommentTransform(f, func(text string) string {
+		if err != nil {
+			return text // an earlier comment failed: leave the rest untouched
+		}
+		var newtext string
+		newtext, err = formula(text)
+		return newtext
+	})
 	if err != nil {
-		return "", err
+		return nil, err
 	}
 
-	n := len(arg)
-	if err := formula(w, arg[1:n-1]); err != nil {
-		return "", err
+	// Format.
+	buf := bytes.NewBuffer(nil)
+	if err := format.Node(buf, fset, transformed); err != nil {
+		return nil, err
 	}
+	return buf.Bytes(), nil
+}
 
-	return rest, nil
+// CommentTransform applies transform once, in place, to every comment under root.
+func CommentTransform(root ast.Node, transform func(string) string) ast.Node {
+	seen := map[*ast.Comment]bool{} // doc comments are reachable via two paths
+	var pre astutil.ApplyFunc
+	pre = func(c *astutil.Cursor) bool {
+		switch n := c.Node().(type) {
+		case *ast.Comment:
+			if !seen[n] {
+				seen[n], n.Text = true, transform(n.Text)
+			}
+		case *ast.File:
+			for _, g := range n.Comments { // Apply does not descend into these
+				astutil.Apply(g, pre, nil)
+			}
+		}
+		return true
+	}
+	return astutil.Apply(root, pre, nil)
 }
 
 // Fixed data structures required for formula processing.
@@ -86,15 +91,16 @@ func init() {
 }
 
 // formula processes a formula in s, writing the result to w.
-func formula(w *bytes.Buffer, s string) error {
+func formula(s string) (string, error) {
 	if len(s) == 0 {
-		return nil
+		return "", nil
 	}
 
 	// Replace symbols.
 	s = replacer.Replace(s)
 
 	// Replace super/subscripts.
+	buf := bytes.NewBuffer(nil)
 	last := None
 	for len(s) > 0 {
 		r, size := utf8.DecodeRuneInString(s)
@@ -107,7 +113,7 @@ func formula(w *bytes.Buffer, s string) error {
 		case '_':
 			repl = sub
 		default:
-			w.WriteRune(r)
+			buf.WriteRune(r)
 			last = r
 			s = s[size:]
 			continue
@@ -116,19 +122,19 @@ func formula(w *bytes.Buffer, s string) error {
 		// Perform replacement.
 		if unicode.IsPrint(last) && !unicode.IsSpace(last) {
 			var err error
-			s, err = supsub(w, s, repl)
+			s, err = supsub(buf, s, repl)
 			if err != nil {
-				return err
+				return "", err
 			}
 		} else {
-			w.WriteRune(r)
+			buf.WriteRune(r)
 			s = s[size:]
 		}
 
 		last = None
 	}
 
-	return nil
+	return buf.String(), nil
 }
 
 // supsub processes a super/subscript starting at s, writing the result to w.
diff --git a/format_test.go b/format_test.go
index 700b35b..2811659 100644
--- a/format_test.go
+++ b/format_test.go
@@ -1,7 +1,6 @@
 package main
 
 import (
-	"bytes"
 	"testing"
 )
 
@@ -58,12 +57,10 @@ func TestFormula(t *testing.T) {
 	for _, c := range cases {
 		c := c // scopelint
 		t.Run(c.Name, func(t *testing.T) {
-			buf := bytes.NewBuffer(nil)
-			err := formula(buf, c.Input)
+			got, err := formula(c.Input)
 			if err != nil {
 				t.Fatal(err)
 			}
-			got := buf.String()
 			if got != c.Expect {
 				t.Logf("input  = %q", c.Input)
 				t.Logf("got    = %q", got)
diff --git a/go.mod b/go.mod
index 1ebf485..0a64045 100644
--- a/go.mod
+++ b/go.mod
@@ -1,3 +1,5 @@
 module github.com/mmcloughlin/mathfmt
 
 go 1.11
+
+require golang.org/x/tools v0.0.0-20200204230316-67a4523381ef
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..c1bacaf
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,12 @@
+golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
+golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
+golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg=
+golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
+golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
+golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
+golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
+golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
+golang.org/x/tools v0.0.0-20200204230316-67a4523381ef h1:mdhEDFpO1Tfj7PXIflIuP1tbXt4rJgHIvbzdh62SARw=
+golang.org/x/tools v0.0.0-20200204230316-67a4523381ef/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
+golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
diff --git a/testdata/p256.golden b/testdata/p256.golden
index f3c8fdf..aade117 100644
--- a/testdata/p256.golden
+++ b/testdata/p256.golden
@@ -372,7 +372,7 @@ func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
 	// by adding multiplies of p without affecting the value.
 	//
 	// So we eliminate limbs from right to left. Since the bottom 29 bits of p
-	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] == 0.
+	// are all ones, then by adding tmp2[0]*p to tmp2 we'll make tmp2[0] ≡ 0.
 	// We can do that for 8 further limbs and then right shift to eliminate the
 	// extra factor of R.
 	for i := 0; ; i += 2 {
@@ -655,16 +655,16 @@ func p256Assign(out, in *[p256Limbs]uint32) {
 	*out = *in
 }
 
-// p256Invert calculates |out| = |in|^{-1}
+// p256Invert calculates |out| = |in|⁻¹
 //
 // Based on Fermat's Little Theorem:
-//   a^p = a (mod p)
-//   a^{p-1} = 1 (mod p)
-//   a^{p-2} = a^{-1} (mod p)
+//   aᵖ = a (mod p)
+//   aᵖ⁻¹ = 1 (mod p)
+//   aᵖ⁻² = a⁻¹ (mod p)
 func p256Invert(out, in *[p256Limbs]uint32) {
 	var ftmp, ftmp2 [p256Limbs]uint32
 
-	// each e_I will hold |in|^{2^I - 1}
+	// each e_I will hold |in|^{2ᴵ - 1}
 	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
 
 	p256Square(&ftmp, in)     // 2¹
@@ -939,7 +939,7 @@ func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
 	}
 }
 
-// p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
+// p256SelectAffinePoint sets {xOut,yOut} to the index'th entry of table.
 // On entry: index < 16, table[0] must be zero.
 func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
 	for i := range xOut {
@@ -966,7 +966,7 @@ func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index
 	}
 }
 
-// p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
+// p256SelectJacobianPoint sets {xOut,yOut,zOut} to the index'th entry of
 // table.
 // On entry: index < 16, table[0] must be zero.
 func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
diff --git a/testdata/p256.in b/testdata/p256.in
index 212b749..ce4e5e0 100644
--- a/testdata/p256.in
+++ b/testdata/p256.in
@@ -20,7 +20,7 @@ var (
 	p256Params *CurveParams
 
 	// RInverse contains 1/R mod p - the inverse of the Montgomery constant
-	// (\mathfmt{2^257}).
+	// (2^257).
 	p256RInverse *big.Int
 )
 
@@ -86,19 +86,19 @@ func (p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int)
 // Field elements are represented as nine, unsigned 32-bit words.
 //
 // The value of an field element is:
-//   \mathfmt{x[0] + (x[1] * 2^29) + (x[2] * 2^57) + ... + (x[8] * 2^228)}
+//   x[0] + (x[1] * 2^29) + (x[2] * 2^57) + ... + (x[8] * 2^228)
 //
 // That is, each limb is alternately 29 or 28-bits wide in little-endian
 // order.
 //
-// This means that a field element hits \mathfmt{2^257}, rather than \mathfmt{2^256} as we would
-// like. A 28, 29, ... pattern would cause us to hit \mathfmt{2^256}, but that causes
+// This means that a field element hits 2^257, rather than 2^256 as we would
+// like. A 28, 29, ... pattern would cause us to hit 2^256, but that causes
 // problems when multiplying as terms end up one bit short of a limb which
 // would require much bit-shifting to correct.
 //
 // Finally, the values stored in a field element are in Montgomery form. So the
 // value |y| is stored as (y*R) mod p, where p is the P-256 prime and R is
-// \mathfmt{2^257}.
+// 2^257.
 
 const (
 	p256Limbs    = 9
@@ -125,23 +125,23 @@ var (
 //   Index  |  Index (binary) | Value
 //       0  |           0000  | 0G (all zeros, omitted)
 //       1  |           0001  | G
-//       2  |           0010  | \mathfmt{2^64G}
-//       3  |           0011  | \mathfmt{2^64G + G}
-//       4  |           0100  | \mathfmt{2^128G}
-//       5  |           0101  | \mathfmt{2^128G + G}
-//       6  |           0110  | \mathfmt{2^128G + 2^64G}
-//       7  |           0111  | \mathfmt{2^128G + 2^64G + G}
-//       8  |           1000  | \mathfmt{2^192G}
-//       9  |           1001  | \mathfmt{2^192G + G}
-//      10  |           1010  | \mathfmt{2^192G + 2^64G}
-//      11  |           1011  | \mathfmt{2^192G + 2^64G + G}
-//      12  |           1100  | \mathfmt{2^192G + 2^128G}
-//      13  |           1101  | \mathfmt{2^192G + 2^128G + G}
-//      14  |           1110  | \mathfmt{2^192G + 2^128G + 2^64G}
-//      15  |           1111  | \mathfmt{2^192G + 2^128G + 2^64G + G}
+//       2  |           0010  | 2^64G
+//       3  |           0011  | 2^64G + G
+//       4  |           0100  | 2^128G
+//       5  |           0101  | 2^128G + G
+//       6  |           0110  | 2^128G + 2^64G
+//       7  |           0111  | 2^128G + 2^64G + G
+//       8  |           1000  | 2^192G
+//       9  |           1001  | 2^192G + G
+//      10  |           1010  | 2^192G + 2^64G
+//      11  |           1011  | 2^192G + 2^64G + G
+//      12  |           1100  | 2^192G + 2^128G
+//      13  |           1101  | 2^192G + 2^128G + G
+//      14  |           1110  | 2^192G + 2^128G + 2^64G
+//      15  |           1111  | 2^192G + 2^128G + 2^64G + G
 //
-// The second table follows the same style, but the terms are \mathfmt{2^32G},
-// \mathfmt{2^96G, 2^160G, 2^224G}.
+// The second table follows the same style, but the terms are 2^32G,
+// 2^96G, 2^160G, 2^224G.
 //
 // This is ~2KB of data.
 var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
@@ -210,23 +210,23 @@ var p256Precomputed = [p256Limbs * 2 * 15 * 2]uint32{
 // Field element operations:
 
 // nonZeroToAllOnes returns:
-//   \mathfmt{0xffffffff for 0 < x <= 2^31}
-//   \mathfmt{0 for x == 0 or x > 2^31.}
+//   0xffffffff for 0 < x <= 2^31
+//   0 for x == 0 or x > 2^31.
 func nonZeroToAllOnes(x uint32) uint32 {
 	return ((x - 1) >> 31) - 1
 }
 
 // p256ReduceCarry adds a multiple of p in order to cancel |carry|,
-// which is a term at \mathfmt{2^257}.
+// which is a term at 2^257.
 //
-// On entry: \mathfmt{carry < 2^3, inout[0,2,...] < 2^29, inout[1,3,...] < 2^28}.
-// On exit: \mathfmt{inout[0,2,..] < 2^30, inout[1,3,...] < 2^29}.
+// On entry: carry < 2^3, inout[0,2,...] < 2^29, inout[1,3,...] < 2^28.
+// On exit: inout[0,2,..] < 2^30, inout[1,3,...] < 2^29.
 func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
 	carry_mask := nonZeroToAllOnes(carry)
 
 	inout[0] += carry << 1
 	inout[3] += 0x10000000 & carry_mask
-	// \mathfmt{carry < 2^3 thus (carry << 11) < 2^14 and we added 2^28 in the}
+	// carry < 2^3 thus (carry << 11) < 2^14 and we added 2^28 in the
 	// previous line therefore this doesn't underflow.
 	inout[3] -= carry << 11
 	inout[4] += (0x20000000 - 1) & carry_mask
@@ -242,7 +242,7 @@ func p256ReduceCarry(inout *[p256Limbs]uint32, carry uint32) {
 // p256Sum sets out = in+in2.
 //
 // On entry, in[i]+in2[i] must not overflow a 32-bit word.
-// On exit: \mathfmt{out[0,2,...] < 2^30, out[1,3,...] < 2^29}
+// On exit: out[0,2,...] < 2^30, out[1,3,...] < 2^29
 func p256Sum(out, in, in2 *[p256Limbs]uint32) {
 	carry := uint32(0)
 	for i := 0; ; i++ {
@@ -278,9 +278,9 @@ var p256Zero31 = [p256Limbs]uint32{two31m3, two30m2, two31m2, two30p13m2, two31m
 
 // p256Diff sets out = in-in2.
 //
-// On entry: \mathfmt{in[0,2,...] < 2^30, in[1,3,...] < 2^29 and}
-//           \mathfmt{in2[0,2,...] < 2^30, in2[1,3,...] < 2^29.}
-// On exit: \mathfmt{out[0,2,...] < 2^30, out[1,3,...] < 2^29}.
+// On entry: in[0,2,...] < 2^30, in[1,3,...] < 2^29 and
+//           in2[0,2,...] < 2^30, in2[1,3,...] < 2^29.
+// On exit: out[0,2,...] < 2^30, out[1,3,...] < 2^29.
 func p256Diff(out, in, in2 *[p256Limbs]uint32) {
 	var carry uint32
 
@@ -310,12 +310,12 @@ func p256Diff(out, in, in2 *[p256Limbs]uint32) {
 // the same 29,28,... bit positions as an field element.
 //
 // The values in field elements are in Montgomery form: x*R mod p where R =
-// \mathfmt{2^257}. Since we just multiplied two Montgomery values together, the result
+// 2^257. Since we just multiplied two Montgomery values together, the result
 // is x*y*R*R mod p. We wish to divide by R in order for the result also to be
 // in Montgomery form.
 //
-// On entry: \mathfmt{tmp[i] < 2^64}
-// On exit: \mathfmt{out[0,2,...] < 2^30, out[1,3,...] < 2^29}
+// On entry: tmp[i] < 2^64
+// On exit: out[0,2,...] < 2^30, out[1,3,...] < 2^29
 func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
 	// The following table may be helpful when reading this code:
 	//
@@ -367,7 +367,7 @@ func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
 
 	// Montgomery elimination of terms:
 	//
-	// Since R is \mathfmt{2^257}, we can divide by R with a bitwise shift if we can
+	// Since R is 2^257, we can divide by R with a bitwise shift if we can
 	// ensure that the right-most 257 bits are all zero. We can make that true
 	// by adding multiplies of p without affecting the value.
 	//
@@ -388,7 +388,7 @@ func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
 		// The following table contains the amounts added to each word (as an
 		// offset from the value of i at the top of the loop). The amounts are
 		// accounted for from the first and second half of the loop separately
-		// and are written as, for example, 28 to mean a value \mathfmt{<2^28}.
+		// and are written as, for example, 28 to mean a value <2^28.
 		//
 		// Word:                   3   4   5   6   7   8   9   10
 		// Added in top half:     28  11      29  21  29  28
@@ -402,7 +402,7 @@ func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
 		// the total value added will be the values added at 7, 5 and 3.
 		//
 		// The following table accumulates these values. The sums at the bottom
-		// are written as, for example, 29+28, to mean a value < \mathfmt{2^29+2^28}.
+		// are written as, for example, 29+28, to mean a value < 2^29+2^28.
 		//
 		// Word:                   3   4   5   6   7   8   9  10  11  12  13
 		//                        28  11  10  29  21  29  28  28  28  28  28
@@ -421,8 +421,8 @@ func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
 		//                                                    11      11
 		//
 		// So the greatest amount is added to tmp2[10] and tmp2[12]. If
-		// \mathfmt{tmp2[10/12] has an initial value of <2^29, then the maximum value}
-		// \mathfmt{will be < 2^31 + 2^30 + 2^28 + 2^21 + 2^11, which is < 2^32,}
+		// tmp2[10/12] has an initial value of <2^29, then the maximum value
+		// will be < 2^31 + 2^30 + 2^28 + 2^21 + 2^11, which is < 2^32,
 		// as required.
 		tmp2[i+3] += (x << 10) & bottom28Bits
 		tmp2[i+4] += (x >> 18)
@@ -431,7 +431,7 @@ func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
 		tmp2[i+7] += x >> 8
 
 		// At position 200, which is the starting bit position for word 7, we
-		// have a factor of 0xf000000 = \mathfmt{2^28 - 2^24}.
+		// have a factor of 0xf000000 = 2^28 - 2^24.
 		tmp2[i+7] += 0x10000000 & xMask
 		tmp2[i+8] += (x - 1) & xMask
 		tmp2[i+7] -= (x << 24) & bottom28Bits
@@ -458,7 +458,7 @@ func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
 
 		// At position 199, which is the starting bit of the 8th word when
 		// dealing with a context starting on an odd word, we have a factor of
-		// \mathfmt{0x1e000000 = 2^29 - 2^25. Since we have not updated i, the 8th}
+		// 0x1e000000 = 2^29 - 2^25. Since we have not updated i, the 8th
 		// word from i+1 is i+8.
 		tmp2[i+8] += 0x20000000 & xMask
 		tmp2[i+9] += (x - 1) & xMask
@@ -470,12 +470,12 @@ func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
 		tmp2[i+10] += (x - 1) & xMask
 	}
 
-	// We merge the right shift with a carry chain. The words above \mathfmt{2^257} have
+	// We merge the right shift with a carry chain. The words above 2^257 have
 	// widths of 28,29,... which we need to correct when copying them down.
 	carry = 0
 	for i := 0; i < 8; i++ {
 		// The maximum value of tmp2[i + 9] occurs on the first iteration and
-		// is \mathfmt{< 2^30+2^29+2^28. Adding 2^29 (from tmp2[i + 10])} is
+		// is < 2^30+2^29+2^28. Adding 2^29 (from tmp2[i + 10]) is
 		// therefore safe.
 		out[i] = tmp2[i+9]
 		out[i] += carry
@@ -500,8 +500,8 @@ func p256ReduceDegree(out *[p256Limbs]uint32, tmp [17]uint64) {
 
 // p256Square sets out=in*in.
 //
-// On entry: \mathfmt{in[0,2,...] < 2^30, in[1,3,...] < 2^29}.
-// On exit: \mathfmt{out[0,2,...] < 2^30, out[1,3,...] < 2^29}.
+// On entry: in[0,2,...] < 2^30, in[1,3,...] < 2^29.
+// On exit: out[0,2,...] < 2^30, out[1,3,...] < 2^29.
 func p256Square(out, in *[p256Limbs]uint32) {
 	var tmp [17]uint64
 
@@ -525,8 +525,8 @@ func p256Square(out, in *[p256Limbs]uint32) {
 		uint64(in[1])*(uint64(in[6])<<1) +
 		uint64(in[2])*(uint64(in[5])<<1) +
 		uint64(in[3])*(uint64(in[4])<<1)
-	// tmp[8] has the greatest value of \mathfmt{2^61 + 2^60 + 2^61 + 2^60 + 2^60},
-	// which is \mathfmt{< 2^64} as required.
+	// tmp[8] has the greatest value of 2^61 + 2^60 + 2^61 + 2^60 + 2^60,
+	// which is < 2^64 as required.
 	tmp[8] = uint64(in[0])*(uint64(in[8])<<1) +
 		uint64(in[1])*(uint64(in[7])<<2) +
 		uint64(in[2])*(uint64(in[6])<<1) +
@@ -558,9 +558,9 @@ func p256Square(out, in *[p256Limbs]uint32) {
 
 // p256Mul sets out=in*in2.
 //
-// On entry: \mathfmt{in[0,2,...] < 2^30, in[1,3,...] < 2^29} and
-//           \mathfmt{in2[0,2,...] < 2^30, in2[1,3,...] < 2^29}.
-// On exit: \mathfmt{out[0,2,...] < 2^30, out[1,3,...] < 2^29}.
+// On entry: in[0,2,...] < 2^30, in[1,3,...] < 2^29 and
+//           in2[0,2,...] < 2^30, in2[1,3,...] < 2^29.
+// On exit: out[0,2,...] < 2^30, out[1,3,...] < 2^29.
 func p256Mul(out, in, in2 *[p256Limbs]uint32) {
 	var tmp [17]uint64
 
@@ -667,65 +667,65 @@ func p256Invert(out, in *[p256Limbs]uint32) {
 	// each e_I will hold |in|^{2^I - 1}
 	var e2, e4, e8, e16, e32, e64 [p256Limbs]uint32
 
-	p256Square(&ftmp, in)     // \mathfmt{2^1}
-	p256Mul(&ftmp, in, &ftmp) // \mathfmt{2^2 - 2^0}
+	p256Square(&ftmp, in)     // 2^1
+	p256Mul(&ftmp, in, &ftmp) // 2^2 - 2^0
 	p256Assign(&e2, &ftmp)
-	p256Square(&ftmp, &ftmp)   // \mathfmt{2^3 - 2^1}
-	p256Square(&ftmp, &ftmp)   // \mathfmt{2^4 - 2^2}
-	p256Mul(&ftmp, &ftmp, &e2) // \mathfmt{2^4 - 2^0}
+	p256Square(&ftmp, &ftmp)   // 2^3 - 2^1
+	p256Square(&ftmp, &ftmp)   // 2^4 - 2^2
+	p256Mul(&ftmp, &ftmp, &e2) // 2^4 - 2^0
 	p256Assign(&e4, &ftmp)
-	p256Square(&ftmp, &ftmp)   // \mathfmt{2^5 - 2^1}
-	p256Square(&ftmp, &ftmp)   // \mathfmt{2^6 - 2^2}
-	p256Square(&ftmp, &ftmp)   // \mathfmt{2^7 - 2^3}
-	p256Square(&ftmp, &ftmp)   // \mathfmt{2^8 - 2^4}
-	p256Mul(&ftmp, &ftmp, &e4) // \mathfmt{2^8 - 2^0}
+	p256Square(&ftmp, &ftmp)   // 2^5 - 2^1
+	p256Square(&ftmp, &ftmp)   // 2^6 - 2^2
+	p256Square(&ftmp, &ftmp)   // 2^7 - 2^3
+	p256Square(&ftmp, &ftmp)   // 2^8 - 2^4
+	p256Mul(&ftmp, &ftmp, &e4) // 2^8 - 2^0
 	p256Assign(&e8, &ftmp)
 	for i := 0; i < 8; i++ {
 		p256Square(&ftmp, &ftmp)
-	} // \mathfmt{2^16 - 2^8}
-	p256Mul(&ftmp, &ftmp, &e8) // \mathfmt{2^16 - 2^0}
+	} // 2^16 - 2^8
+	p256Mul(&ftmp, &ftmp, &e8) // 2^16 - 2^0
 	p256Assign(&e16, &ftmp)
 	for i := 0; i < 16; i++ {
 		p256Square(&ftmp, &ftmp)
-	} // \mathfmt{2^32 - 2^16}
-	p256Mul(&ftmp, &ftmp, &e16) // \mathfmt{2^32 - 2^0}
+	} // 2^32 - 2^16
+	p256Mul(&ftmp, &ftmp, &e16) // 2^32 - 2^0
 	p256Assign(&e32, &ftmp)
 	for i := 0; i < 32; i++ {
 		p256Square(&ftmp, &ftmp)
-	} // \mathfmt{2^64 - 2^32}
+	} // 2^64 - 2^32
 	p256Assign(&e64, &ftmp)
-	p256Mul(&ftmp, &ftmp, in) // \mathfmt{2^64 - 2^32 + 2^0}
+	p256Mul(&ftmp, &ftmp, in) // 2^64 - 2^32 + 2^0
 	for i := 0; i < 192; i++ {
 		p256Square(&ftmp, &ftmp)
-	} // \mathfmt{2^256 - 2^224 + 2^192}
+	} // 2^256 - 2^224 + 2^192
 
-	p256Mul(&ftmp2, &e64, &e32) // \mathfmt{2^64 - 2^0}
+	p256Mul(&ftmp2, &e64, &e32) // 2^64 - 2^0
 	for i := 0; i < 16; i++ {
 		p256Square(&ftmp2, &ftmp2)
-	} // \mathfmt{2^80 - 2^16}
-	p256Mul(&ftmp2, &ftmp2, &e16) // \mathfmt{2^80 - 2^0}
+	} // 2^80 - 2^16
+	p256Mul(&ftmp2, &ftmp2, &e16) // 2^80 - 2^0
 	for i := 0; i < 8; i++ {
 		p256Square(&ftmp2, &ftmp2)
-	} // \mathfmt{2^88 - 2^8}
-	p256Mul(&ftmp2, &ftmp2, &e8) // \mathfmt{2^88 - 2^0}
+	} // 2^88 - 2^8
+	p256Mul(&ftmp2, &ftmp2, &e8) // 2^88 - 2^0
 	for i := 0; i < 4; i++ {
 		p256Square(&ftmp2, &ftmp2)
-	} // \mathfmt{2^92 - 2^4}
-	p256Mul(&ftmp2, &ftmp2, &e4) // \mathfmt{2^92 - 2^0}
-	p256Square(&ftmp2, &ftmp2)   // \mathfmt{2^93 - 2^1}
-	p256Square(&ftmp2, &ftmp2)   // \mathfmt{2^94 - 2^2}
-	p256Mul(&ftmp2, &ftmp2, &e2) // \mathfmt{2^94 - 2^0}
-	p256Square(&ftmp2, &ftmp2)   // \mathfmt{2^95 - 2^1}
-	p256Square(&ftmp2, &ftmp2)   // \mathfmt{2^96 - 2^2}
-	p256Mul(&ftmp2, &ftmp2, in)  // \mathfmt{2^96 - 3}
-
-	p256Mul(out, &ftmp2, &ftmp) // \mathfmt{2^256 - 2^224 + 2^192 + 2^96 - 3}
+	} // 2^92 - 2^4
+	p256Mul(&ftmp2, &ftmp2, &e4) // 2^92 - 2^0
+	p256Square(&ftmp2, &ftmp2)   // 2^93 - 2^1
+	p256Square(&ftmp2, &ftmp2)   // 2^94 - 2^2
+	p256Mul(&ftmp2, &ftmp2, &e2) // 2^94 - 2^0
+	p256Square(&ftmp2, &ftmp2)   // 2^95 - 2^1
+	p256Square(&ftmp2, &ftmp2)   // 2^96 - 2^2
+	p256Mul(&ftmp2, &ftmp2, in)  // 2^96 - 3
+
+	p256Mul(out, &ftmp2, &ftmp) // 2^256 - 2^224 + 2^192 + 2^96 - 3
 }
 
 // p256Scalar3 sets out=3*out.
 //
-// On entry: \mathfmt{out[0,2,...] < 2^30, out[1,3,...] < 2^29}.
-// On exit: \mathfmt{out[0,2,...] < 2^30, out[1,3,...] < 2^29}.
+// On entry: out[0,2,...] < 2^30, out[1,3,...] < 2^29.
+// On exit: out[0,2,...] < 2^30, out[1,3,...] < 2^29.
 func p256Scalar3(out *[p256Limbs]uint32) {
 	var carry uint32
 
@@ -751,8 +751,8 @@ func p256Scalar3(out *[p256Limbs]uint32) {
 
 // p256Scalar4 sets out=4*out.
 //
-// On entry: \mathfmt{out[0,2,...] < 2^30, out[1,3,...] < 2^29}.
-// On exit: \mathfmt{out[0,2,...] < 2^30, out[1,3,...] < 2^29}.
+// On entry: out[0,2,...] < 2^30, out[1,3,...] < 2^29.
+// On exit: out[0,2,...] < 2^30, out[1,3,...] < 2^29.
 func p256Scalar4(out *[p256Limbs]uint32) {
 	var carry, nextCarry uint32
 
@@ -781,8 +781,8 @@ func p256Scalar4(out *[p256Limbs]uint32) {
 
 // p256Scalar8 sets out=8*out.
 //
-// On entry: \mathfmt{out[0,2,...] < 2^30, out[1,3,...] < 2^29}.
-// On exit: \mathfmt{out[0,2,...] < 2^30, out[1,3,...] < 2^29}.
+// On entry: out[0,2,...] < 2^30, out[1,3,...] < 2^29.
+// On exit: out[0,2,...] < 2^30, out[1,3,...] < 2^29.
 func p256Scalar8(out *[p256Limbs]uint32) {
 	var carry, nextCarry uint32
 
@@ -939,7 +939,7 @@ func p256CopyConditional(out, in *[p256Limbs]uint32, mask uint32) {
 	}
 }
 
-// p256SelectAffinePoint sets {out_x,out_y} to the index'th entry of table.
+// p256SelectAffinePoint sets {xOut,yOut} to the index'th entry of table.
 // On entry: index < 16, table[0] must be zero.
 func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index uint32) {
 	for i := range xOut {
@@ -966,7 +966,7 @@ func p256SelectAffinePoint(xOut, yOut *[p256Limbs]uint32, table []uint32, index
 	}
 }
 
-// p256SelectJacobianPoint sets {out_x,out_y,out_z} to the index'th entry of
+// p256SelectJacobianPoint sets {xOut,yOut,zOut} to the index'th entry of
 // table.
 // On entry: index < 16, table[0] must be zero.
 func p256SelectJacobianPoint(xOut, yOut, zOut *[p256Limbs]uint32, table *[16][3][p256Limbs]uint32, index uint32) {
diff --git a/testdata/poly1305.golden b/testdata/poly1305.golden
index 1187eab..edc0fa1 100644
--- a/testdata/poly1305.golden
+++ b/testdata/poly1305.golden
@@ -174,18 +174,18 @@ func updateGeneric(state *macState, msg []byte) {
 		//
 		// We are multiplying a 3 limbs number, h, by a 2 limbs number, r.
 		//
-		//                        h2    h1    h0  x
+		//                        h2    h1    h0  ×
 		//                              r1    r0  =
 		//                       ----------------
-		//                      h2r0  h1r0  h0r0     <-- individual 128-bit products
+		//                      h2r0  h1r0  h0r0     ← individual 128-bit products
 		//            +   h2r1  h1r1  h0r1
 		//               ------------------------
-		//                 m3    m2    m1    m0      <-- result in 128-bit overlapping limbs
+		//                 m3    m2    m1    m0      ← result in 128-bit overlapping limbs
 		//               ------------------------
-		//         m3.hi m2.hi m1.hi m0.hi           <-- carry propagation
+		//         m3.hi m2.hi m1.hi m0.hi           ← carry propagation
 		//     +         m3.lo m2.lo m1.lo m0.lo
 		//        -------------------------------
-		//           t4    t3    t2    t1    t0      <-- final result in 64-bit limbs
+		//           t4    t3    t2    t1    t0      ← final result in 64-bit limbs
 		//
 		// The main difference from pen-and-paper multiplication is that we do
 		// carry propagation in a separate step, as if we wrote two digit sums
@@ -263,7 +263,7 @@ const (
 	maskNotLow2Bits uint64 = ^maskLow2Bits
 )
 
-// select64 returns x if v == 1 and y if v == 0, in constant time.
+// select64 returns x if v ≡ 1 and y if v ≡ 0, in constant time.
 func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y }
 
 // [p0, p1, p2] is 2¹³⁰ - 5 in little endian order.
diff --git a/testdata/poly1305.in b/testdata/poly1305.in
index c3a14f0..92f9c50 100644
--- a/testdata/poly1305.in
+++ b/testdata/poly1305.in
@@ -12,14 +12,14 @@ import "encoding/binary"
 // Poly1305 [RFC 7539] is a relatively simple algorithm: the authentication tag
 // for a 64 bytes message is approximately
 //
-//     \mathfmt{s + m[0:16] * r^4 + m[16:32] * r^3 + m[32:48] * r^2 + m[48:64] * r  mod  2^130 - 5}
+//     s + m[0:16] * r^4 + m[16:32] * r^3 + m[32:48] * r^2 + m[48:64] * r  mod  2^130 - 5
 //
 // for some secret r and s. It can be computed sequentially like
 //
 //     for len(msg) > 0:
 //         h += read(msg, 16)
 //         h *= r
-//         \mathfmt{h %= 2^130 - 5}
+//         h %= 2^130 - 5
 //     return h + s
 //
 // All the complexity is about doing performant constant-time math on numbers
@@ -37,9 +37,9 @@ func newMACGeneric(key *[32]byte) (h macGeneric) {
 }
 
 // macState holds numbers in saturated 64-bit little-endian limbs. That is,
-// the value of [x0, x1, x2] is \mathfmt{x[0] + x[1] * 2^64 + x[2] * 2^128}.
+// the value of [x0, x1, x2] is x[0] + x[1] * 2^64 + x[2] * 2^128.
 type macState struct {
-	// h is the main accumulator. It is to be interpreted modulo \mathfmt{2^130 - 5}, but
+	// h is the main accumulator. It is to be interpreted modulo 2^130 - 5, but
 	// can grow larger during and after rounds.
 	h [3]uint64
 	// r and s are the private key components.
@@ -133,7 +133,7 @@ func shiftRightBy2(a uint128) uint128 {
 // updateGeneric absorbs msg into the state.h accumulator. For each chunk m of
 // 128 bits of message, it computes
 //
-//     \mathfmt{h_+ = (h + m) * r  mod  2^130 - 5}
+//     h_+ = (h + m) * r  mod  2^130 - 5
 //
 // If the msg length is not a multiple of TagSize, it assumes the last
 // incomplete chunk is the final one.
@@ -145,12 +145,12 @@ func updateGeneric(state *macState, msg []byte) {
 		var c uint64
 
 		// For the first step, h + m, we use a chain of bits.Add64 intrinsics.
-		// The resulting value of h might exceed \mathfmt{2^130 - 5}, but will be partially
+		// The resulting value of h might exceed 2^130 - 5, but will be partially
 		// reduced at the end of the multiplication below.
 		//
 		// The spec requires us to set a bit just above the message size, not to
 		// hide leading zeroes. For full chunks, that's 1 << 128, so we can just
-		// add 1 to the most significant (\mathfmt{2^128}) limb, h2.
+		// add 1 to the most significant (2^128) limb, h2.
 		if len(msg) >= TagSize {
 			h0, c = bitsAdd64(h0, binary.LittleEndian.Uint64(msg[0:8]), 0)
 			h1, c = bitsAdd64(h1, binary.LittleEndian.Uint64(msg[8:16]), c)
@@ -174,18 +174,18 @@ func updateGeneric(state *macState, msg []byte) {
 		//
 		// We are multiplying a 3 limbs number, h, by a 2 limbs number, r.
 		//
-		//                        h2    h1    h0  x
+		//                        h2    h1    h0  \times
 		//                              r1    r0  =
 		//                       ----------------
-		//                      h2r0  h1r0  h0r0     <-- individual 128-bit products
+		//                      h2r0  h1r0  h0r0     <- individual 128-bit products
 		//            +   h2r1  h1r1  h0r1
 		//               ------------------------
-		//                 m3    m2    m1    m0      <-- result in 128-bit overlapping limbs
+		//                 m3    m2    m1    m0      <- result in 128-bit overlapping limbs
 		//               ------------------------
-		//         m3.hi m2.hi m1.hi m0.hi           <-- carry propagation
+		//         m3.hi m2.hi m1.hi m0.hi           <- carry propagation
 		//     +         m3.lo m2.lo m1.lo m0.lo
 		//        -------------------------------
-		//           t4    t3    t2    t1    t0      <-- final result in 64-bit limbs
+		//           t4    t3    t2    t1    t0      <- final result in 64-bit limbs
 		//
 		// The main difference from pen-and-paper multiplication is that we do
 		// carry propagation in a separate step, as if we wrote two digit sums
@@ -221,18 +221,18 @@ func updateGeneric(state *macState, msg []byte) {
 		t3, _ := bitsAdd64(m3.lo, m2.hi, c)
 
 		// Now we have the result as 4 64-bit limbs, and we need to reduce it
-		// modulo \mathfmt{2^130 - 5}. The special shape of this Crandall prime lets us do
+		// modulo 2^130 - 5. The special shape of this Crandall prime lets us do
 		// a cheap partial reduction according to the reduction identity
 		//
-		//     \mathfmt{c * 2^130 + n  =  c * 5 + n  mod  2^130 - 5}
+		//     c * 2^130 + n  =  c * 5 + n  mod  2^130 - 5
 		//
-		// because \mathfmt{2^130 = 5 mod 2^130 - 5}. Partial reduction since the result is
-		// likely to be larger than \mathfmt{2^130 - 5}, but still small enough to fit the
+		// because 2^130 = 5 mod 2^130 - 5. Partial reduction since the result is
+		// likely to be larger than 2^130 - 5, but still small enough to fit the
 		// assumptions we make about h in the rest of the code.
 		//
 		// See also https://speakerdeck.com/gtank/engineering-prime-numbers?slide=23
 
-		// We split the final result at the \mathfmt{2^130} mark into h and cc, the carry.
+		// We split the final result at the 2^130 mark into h and cc, the carry.
 		// Note that the carry bits are effectively shifted left by 2, in other
 		// words, cc = c * 4 for the c in the reduction identity.
 		h0, h1, h2 = t0, t1, t2&maskLow2Bits
@@ -252,7 +252,7 @@ func updateGeneric(state *macState, msg []byte) {
 
 		// h2 is at most 3 + 1 + 1 = 5, making the whole of h at most
 		//
-		//     \mathfmt{5 * 2^128 + (2^128 - 1) = 6 * 2^128 - 1}
+		//     5 * 2^128 + (2^128 - 1) = 6 * 2^128 - 1
 	}
 
 	state.h[0], state.h[1], state.h[2] = h0, h1, h2
@@ -266,7 +266,7 @@ const (
 // select64 returns x if v == 1 and y if v == 0, in constant time.
 func select64(v, x, y uint64) uint64 { return ^(v-1)&x | (v-1)&y }
 
-// [p0, p1, p2] is \mathfmt{2^130 - 5} in little endian order.
+// [p0, p1, p2] is 2^130 - 5 in little endian order.
 const (
 	p0 = 0xFFFFFFFFFFFFFFFB
 	p1 = 0xFFFFFFFFFFFFFFFF
@@ -275,14 +275,14 @@ const (
 
 // finalize completes the modular reduction of h and computes
 //
-//     \mathfmt{out = h + s  mod  2^128}
+//     out = h + s  mod  2^128
 //
 func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
 	h0, h1, h2 := h[0], h[1], h[2]
 
 	// After the partial reduction in updateGeneric, h might be more than
-	// \mathfmt{2^130 - 5}, but will be less than \mathfmt{2 * (2^130 - 5)}. To complete the reduction
-	// in constant time, we compute \mathfmt{t = h - (2^130 - 5)}, and select h as the
+	// 2^130 - 5, but will be less than 2 * (2^130 - 5). To complete the reduction
+	// in constant time, we compute t = h - (2^130 - 5), and select h as the
 	// result if the subtraction underflows, and t otherwise.
 
 	hMinusP0, b := bitsSub64(h0, p0, 0)
@@ -295,7 +295,7 @@ func finalize(out *[TagSize]byte, h *[3]uint64, s *[2]uint64) {
 
 	// Finally, we compute the last Poly1305 step
 	//
-	//     \mathfmt{tag = h + s  mod  2^128}
+	//     tag = h + s  mod  2^128
 	//
 	// by just doing a wide addition with the 128 low bits of h and discarding
 	// the overflow.