Add codepoints parser

bwrrp · Mar 31, 2022 · 04025af · 04025af
1 parent ec91556
commit 04025af
Show file tree

Hide file tree

Showing 7 changed files with 186 additions and 4 deletions.
diff --git a/api/prsc.api.json b/api/prsc.api.json
@@ -172,7 +172,7 @@
         {
           "kind": "Function",
           "canonicalReference": "prsc!codepoint:function(1)",
-          "docComment": "/**\n * Creates a Parser that skips the next code point if the given predicate returns true.\n *\n * This counts in unicode characters (code points), not UTF-16 code units.\n *\n * @param isMatch - callback called with the next codepoint, should return whether that matches\n *\n * @public\n */\n",
+          "docComment": "/**\n * Creates a Parser that skips the next code point if the given predicate returns true.\n *\n * This counts in unicode characters (code points), not UTF-16 code units.\n *\n * To match a sequence of code points, consider using `codepoints` instead.\n *\n * @param isMatch - callback called with the next codepoint, should return whether that matches\n *\n * @param expected - expected strings to return if the codepoint does not match\n *\n * @public\n */\n",
           "excerptTokens": [
             {
               "kind": "Content",
@@ -232,6 +232,69 @@
           ],
           "name": "codepoint"
         },
+        {
+          "kind": "Function",
+          "canonicalReference": "prsc!codepoints:function(1)",
+          "docComment": "/**\n * Creates a Parser that skips code points while the given predicate returns true.\n *\n * This counts in unicode characters (code points), not UTF-16 code units.\n *\n * This acts like `starConsumed(codepoint(isMatch, []))` if expected is not set, or as `plusConsumed(codepoint(isMatch, expected))` if it is, but is much more efficient than either of those combinations.\n *\n * @param isMatch - callback called for each codepoint, should return whether that matches\n *\n * @param expected - expected strings to return if the first codepoint does not match\n *\n * @public\n */\n",
+          "excerptTokens": [
+            {
+              "kind": "Content",
+              "text": "export declare function codepoints(isMatch: "
+            },
+            {
+              "kind": "Content",
+              "text": "(codepoint: number) => boolean"
+            },
+            {
+              "kind": "Content",
+              "text": ", expected?: "
+            },
+            {
+              "kind": "Content",
+              "text": "string[]"
+            },
+            {
+              "kind": "Content",
+              "text": "): "
+            },
+            {
+              "kind": "Reference",
+              "text": "Parser",
+              "canonicalReference": "prsc!Parser:type"
+            },
+            {
+              "kind": "Content",
+              "text": "<void>"
+            },
+            {
+              "kind": "Content",
+              "text": ";"
+            }
+          ],
+          "returnTypeTokenRange": {
+            "startIndex": 5,
+            "endIndex": 7
+          },
+          "releaseTag": "Public",
+          "overloadIndex": 1,
+          "parameters": [
+            {
+              "parameterName": "isMatch",
+              "parameterTypeTokenRange": {
+                "startIndex": 1,
+                "endIndex": 2
+              }
+            },
+            {
+              "parameterName": "expected",
+              "parameterTypeTokenRange": {
+                "startIndex": 3,
+                "endIndex": 4
+              }
+            }
+          ],
+          "name": "codepoints"
+        },
         {
           "kind": "Function",
           "canonicalReference": "prsc!collect:function(1)",

diff --git a/api/prsc.api.md b/api/prsc.api.md
@@ -7,6 +7,9 @@
 // @public
 export function codepoint(isMatch: (codepoint: number) => boolean, expected: string[]): Parser<void>;
 
+// @public
+export function codepoints(isMatch: (codepoint: number) => boolean, expected?: string[]): Parser<void>;
+
 // @public
 export function collect<T, R>(gen: Generator<T, R>): [T[], R];
 

diff --git a/docs/prsc.codepoint.md b/docs/prsc.codepoint.md
@@ -8,6 +8,8 @@ Creates a Parser that skips the next code point if the given predicate returns t
 
 This counts in unicode characters (code points), not UTF-16 code units.
 
+To match a sequence of code points, consider using `codepoints` instead.
+
 <b>Signature:</b>
 
 ```typescript
@@ -19,7 +21,7 @@ export declare function codepoint(isMatch: (codepoint: number) => boolean, expec
 |  Parameter | Type | Description |
 |  --- | --- | --- |
 |  isMatch | (codepoint: number) =&gt; boolean | callback called with the next codepoint, should return whether that matches |
-|  expected | string\[\] |  |
+|  expected | string\[\] | expected strings to return if the codepoint does not match |
 
 <b>Returns:</b>
 

diff --git a/docs/prsc.codepoints.md b/docs/prsc.codepoints.md
@@ -0,0 +1,29 @@
+<!-- Do not edit this file. It is automatically generated by API Documenter. -->
+
+[Home](./index.md) &gt; [prsc](./prsc.md) &gt; [codepoints](./prsc.codepoints.md)
+
+## codepoints() function
+
+Creates a Parser that skips code points while the given predicate returns true.
+
+This counts in unicode characters (code points), not UTF-16 code units.
+
+This acts like `starConsumed(codepoint(isMatch, []))` if expected is not set, or as `plusConsumed(codepoint(isMatch, expected))` if it is, but is much more efficient than either of those combinations.
+
+<b>Signature:</b>
+
+```typescript
+export declare function codepoints(isMatch: (codepoint: number) => boolean, expected?: string[]): Parser<void>;
+```
+
+## Parameters
+
+|  Parameter | Type | Description |
+|  --- | --- | --- |
+|  isMatch | (codepoint: number) =&gt; boolean | callback called for each codepoint, should return whether that matches |
+|  expected | string\[\] | expected strings to return if the first codepoint does not match |
+
+<b>Returns:</b>
+
+[Parser](./prsc.parser.md)<!-- -->&lt;void&gt;
+
diff --git a/docs/prsc.md b/docs/prsc.md
@@ -8,7 +8,8 @@
 
 |  Function | Description |
 |  --- | --- |
-|  [codepoint(isMatch, expected)](./prsc.codepoint.md) | Creates a Parser that skips the next code point if the given predicate returns true.<!-- -->This counts in unicode characters (code points), not UTF-16 code units. |
+|  [codepoint(isMatch, expected)](./prsc.codepoint.md) | Creates a Parser that skips the next code point if the given predicate returns true.<!-- -->This counts in unicode characters (code points), not UTF-16 code units.<!-- -->To match a sequence of code points, consider using <code>codepoints</code> instead. |
+|  [codepoints(isMatch, expected)](./prsc.codepoints.md) | Creates a Parser that skips code points while the given predicate returns true.<!-- -->This counts in unicode characters (code points), not UTF-16 code units.<!-- -->This acts like <code>starConsumed(codepoint(isMatch, []))</code> if expected is not set, or as <code>plusConsumed(codepoint(isMatch, expected))</code> if it is, but is much more efficient than either of those combinations. |
 |  [collect(gen)](./prsc.collect.md) | Helper to collect both the yielded values and the returned value from a generator. |
 |  [complete(parser)](./prsc.complete.md) | Creates a Parser that applies the given parser and only succeeds (returning the inner parser's result) if parsing concludes at the end of the input string. |
 |  [consume(parser)](./prsc.consume.md) | Creates a Parser that applies the given parser but discards the resulting value. |

diff --git a/src/parser-combinators.ts b/src/parser-combinators.ts
@@ -85,9 +85,12 @@ function lengthFromCodePoint(cp: number): number {
  *
  * This counts in unicode characters (code points), not UTF-16 code units.
  *
+ * To match a sequence of code points, consider using `codepoints` instead.
+ *
  * @public
  *
- * @param isMatch - callback called with the next codepoint, should return whether that matches
+ * @param isMatch  - callback called with the next codepoint, should return whether that matches
+ * @param expected - expected strings to return if the codepoint does not match
  */
 export function codepoint(
 	isMatch: (codepoint: number) => boolean,
@@ -102,6 +105,43 @@ export function codepoint(
 	};
 }
 
+/**
+ * Creates a Parser that skips code points while the given predicate returns true.
+ *
+ * This counts in unicode characters (code points), not UTF-16 code units.
+ *
+ * This acts like `starConsumed(codepoint(isMatch, []))` if expected is not set, or as
+ * `plusConsumed(codepoint(isMatch, expected))` if it is, but is much more efficient than either of
+ * those combinations.
+ *
+ * @public
+ *
+ * @param isMatch  - callback called for each codepoint, should return whether that matches
+ * @param expected - expected strings to return if the first codepoint does not match
+ * @returns a Parser that advances past every leading matching code point and yields no value
+ */
+export function codepoints(
+	isMatch: (codepoint: number) => boolean,
+	expected?: string[]
+): Parser<void> {
+	return (input: string, offset: number) => {
+		const startOffset = offset; // remembered to detect whether anything was consumed
+		while (true) {
+			const cp = input.codePointAt(offset);
+			if (cp === undefined) { // no code point at this offset: nothing left to read
+				break;
+			}
+			if (!isMatch(cp)) { // first non-matching code point ends the run and is not consumed
+				break;
+			}
+			offset += cp > 0xffff ? 2 : 1; // code points above U+FFFF occupy two UTF-16 code units
+		}
+		if (expected !== undefined && offset === startOffset) { // a match is only mandatory when expected is given
+			return error(offset, expected);
+		}
+		return ok(offset);
+	};
+}
+
 /**
  * Creates a Parser that matches a single character from a range of codepoints.
  *

diff --git a/test/parser-combinators.tests.ts b/test/parser-combinators.tests.ts
@@ -23,6 +23,8 @@ import {
 	followed,
 	starConsumed,
 	plusConsumed,
+	codepoint,
+	codepoints,
 } from '../src/parser-combinators';
 
 describe('parser combinators', () => {
@@ -39,6 +41,48 @@ describe('parser combinators', () => {
 		});
 	});
 
+	describe('codepoint', () => { // single-code-point parser: a match advances the offset, a mismatch reports expected
+		it('skips a codepoint if it matches', () => {
+			expect(codepoint(() => true, [])('a', 0).success).toBe(true);
+			expect(codepoint(() => true, [])('a', 0).offset).toBe(1);
+		});
+
+		it('returns expected if it does not', () => {
+			expect(codepoint(() => false, ['expected'])('a', 0).success).toBe(false);
+			expect(codepoint(() => false, ['expected'])('a', 0).offset).toBe(0); // failure is reported at the starting offset
+			expect((codepoint(() => false, ['expected'])('a', 0) as any).expected).toEqual([ // cast presumably needed because `expected` only exists on the failure variant of the result type
+				'expected',
+			]);
+		});
+	});
+
+	describe('codepoints', () => {
+		it('skips codepoints while they match', () => {
+			const parser = codepoints((cp) => cp === 'a'.codePointAt(0)); // no expected given: zero matches still counts as success
+			expect(parser('a', 0).success).toBe(true);
+			expect(parser('a', 0).offset).toBe(1);
+			expect(parser('aaab', 0).success).toBe(true);
+			expect(parser('aaab', 0).offset).toBe(3); // stops in front of the first non-matching code point
+			expect(parser('b', 0).success).toBe(true);
+			expect(parser('b', 0).offset).toBe(0); // nothing consumed
+		});
+
+		it('handles surrogate pairs', () => {
+			const parser = codepoints((cp) => cp > 0x10000); // NOTE(review): this excludes U+10000 itself, which is also encoded as a surrogate pair; `> 0xffff` may have been intended - confirm
+			expect(parser('\u{1f4a9}b', 0).success).toBe(true);
+			expect(parser('\u{1f4a9}b', 0).offset).toBe(2); // U+1F4A9 takes two UTF-16 code units
+		});
+
+		it('needs to match at least one if expected is provided', () => {
+			const parser = codepoints((cp) => cp === 'a'.codePointAt(0), ['expected']); // expected given: at least one match is required
+			expect(parser('a', 0).success).toBe(true);
+			expect(parser('a', 0).offset).toBe(1);
+			expect(parser('b', 0).success).toBe(false);
+			expect(parser('b', 0).offset).toBe(0);
+			expect((parser('b', 0) as any).expected).toEqual(['expected']);
+		});
+	});
+
 	describe('range', () => {
 		it('accepts one from a range of unicode characters', () => {
 			expect(range('a'.codePointAt(0)!, 'z'.codePointAt(0)!)('q', 0).success).toBe(true);