diff --git a/api/prsc.api.json b/api/prsc.api.json index 92187b3..02e0946 100644 --- a/api/prsc.api.json +++ b/api/prsc.api.json @@ -172,7 +172,7 @@ { "kind": "Function", "canonicalReference": "prsc!codepoint:function(1)", - "docComment": "/**\n * Creates a Parser that skips the next code point if the given predicate returns true.\n *\n * This counts in unicode characters (code points), not UTF-16 code units.\n *\n * @param isMatch - callback called with the next codepoint, should return whether that matches\n *\n * @public\n */\n", + "docComment": "/**\n * Creates a Parser that skips the next code point if the given predicate returns true.\n *\n * This counts in unicode characters (code points), not UTF-16 code units.\n *\n * To match a sequence of code points, consider using `codepoints` instead.\n *\n * @param isMatch - callback called with the next codepoint, should return whether that matches\n *\n * @param expected - expected strings to return if the codepoint does not match\n *\n * @public\n */\n", "excerptTokens": [ { "kind": "Content", @@ -232,6 +232,69 @@ ], "name": "codepoint" }, + { + "kind": "Function", + "canonicalReference": "prsc!codepoints:function(1)", + "docComment": "/**\n * Creates a Parser that skips code points while the given predicate returns true.\n *\n * This counts in unicode characters (code points), not UTF-16 code units.\n *\n * This acts like `starConsumed(codepoint(isMatch, []))` if expected is not set, or as `plusConsumed(codepoint(isMatch, expected))` if it is, but is much more efficient than either of those combinations.\n *\n * @param isMatch - callback called for each codepoint, should return whether that matches\n *\n * @param expected - expected strings to return if the first codepoint does not match\n *\n * @public\n */\n", + "excerptTokens": [ + { + "kind": "Content", + "text": "export declare function codepoints(isMatch: " + }, + { + "kind": "Content", + "text": "(codepoint: number) => boolean" + }, + { + "kind": 
"Content", + "text": ", expected?: " + }, + { + "kind": "Content", + "text": "string[]" + }, + { + "kind": "Content", + "text": "): " + }, + { + "kind": "Reference", + "text": "Parser", + "canonicalReference": "prsc!Parser:type" + }, + { + "kind": "Content", + "text": "" + }, + { + "kind": "Content", + "text": ";" + } + ], + "returnTypeTokenRange": { + "startIndex": 5, + "endIndex": 7 + }, + "releaseTag": "Public", + "overloadIndex": 1, + "parameters": [ + { + "parameterName": "isMatch", + "parameterTypeTokenRange": { + "startIndex": 1, + "endIndex": 2 + } + }, + { + "parameterName": "expected", + "parameterTypeTokenRange": { + "startIndex": 3, + "endIndex": 4 + } + } + ], + "name": "codepoints" + }, { "kind": "Function", "canonicalReference": "prsc!collect:function(1)", diff --git a/api/prsc.api.md b/api/prsc.api.md index 6e45dbf..eb04314 100644 --- a/api/prsc.api.md +++ b/api/prsc.api.md @@ -7,6 +7,9 @@ // @public export function codepoint(isMatch: (codepoint: number) => boolean, expected: string[]): Parser; +// @public +export function codepoints(isMatch: (codepoint: number) => boolean, expected?: string[]): Parser; + // @public export function collect(gen: Generator): [T[], R]; diff --git a/docs/prsc.codepoint.md b/docs/prsc.codepoint.md index 69ad2f5..c22a2dc 100644 --- a/docs/prsc.codepoint.md +++ b/docs/prsc.codepoint.md @@ -8,6 +8,8 @@ Creates a Parser that skips the next code point if the given predicate returns t This counts in unicode characters (code points), not UTF-16 code units. +To match a sequence of code points, consider using `codepoints` instead. 
+ Signature: ```typescript @@ -19,7 +21,7 @@ export declare function codepoint(isMatch: (codepoint: number) => boolean, expec | Parameter | Type | Description | | --- | --- | --- | | isMatch | (codepoint: number) => boolean | callback called with the next codepoint, should return whether that matches | -| expected | string\[\] | | +| expected | string\[\] | expected strings to return if the codepoint does not match | Returns: diff --git a/docs/prsc.codepoints.md b/docs/prsc.codepoints.md new file mode 100644 index 0000000..d855193 --- /dev/null +++ b/docs/prsc.codepoints.md @@ -0,0 +1,29 @@ + + +[Home](./index.md) > [prsc](./prsc.md) > [codepoints](./prsc.codepoints.md) + +## codepoints() function + +Creates a Parser that skips code points while the given predicate returns true. + +This counts in unicode characters (code points), not UTF-16 code units. + +This acts like `starConsumed(codepoint(isMatch, []))` if expected is not set, or as `plusConsumed(codepoint(isMatch, expected))` if it is, but is much more efficient than either of those combinations. + +Signature: + +```typescript +export declare function codepoints(isMatch: (codepoint: number) => boolean, expected?: string[]): Parser; +``` + +## Parameters + +| Parameter | Type | Description | +| --- | --- | --- | +| isMatch | (codepoint: number) => boolean | callback called for each codepoint, should return whether that matches | +| expected | string\[\] | expected strings to return if the first codepoint does not match | + +Returns: + +[Parser](./prsc.parser.md)<void> + diff --git a/docs/prsc.md b/docs/prsc.md index ce52606..d3a189f 100644 --- a/docs/prsc.md +++ b/docs/prsc.md @@ -8,7 +8,8 @@ | Function | Description | | --- | --- | -| [codepoint(isMatch, expected)](./prsc.codepoint.md) | Creates a Parser that skips the next code point if the given predicate returns true.This counts in unicode characters (code points), not UTF-16 code units. 
| +| [codepoint(isMatch, expected)](./prsc.codepoint.md) | Creates a Parser that skips the next code point if the given predicate returns true.<br/><br/>This counts in unicode characters (code points), not UTF-16 code units.<br/><br/>To match a sequence of code points, consider using <code>codepoints</code> instead. | +| [codepoints(isMatch, expected)](./prsc.codepoints.md) | Creates a Parser that skips code points while the given predicate returns true.<br/><br/>This counts in unicode characters (code points), not UTF-16 code units.<br/><br/>This acts like <code>starConsumed(codepoint(isMatch, []))</code> if expected is not set, or as <code>plusConsumed(codepoint(isMatch, expected))</code> if it is, but is much more efficient than either of those combinations. | | [collect(gen)](./prsc.collect.md) | Helper to collect both the yielded values and the returned value from a generator. | | [complete(parser)](./prsc.complete.md) | Creates a Parser that applies the given parser and only succeeds (returning the inner parser's result) if parsing concludes at the end of the input string. | | [consume(parser)](./prsc.consume.md) | Creates a Parser that applies the given parser but discards the resulting value. | diff --git a/src/parser-combinators.ts b/src/parser-combinators.ts index a224bfa..a39fd7c 100644 --- a/src/parser-combinators.ts +++ b/src/parser-combinators.ts @@ -85,9 +85,12 @@ function lengthFromCodePoint(cp: number): number { * * This counts in unicode characters (code points), not UTF-16 code units. * + * To match a sequence of code points, consider using `codepoints` instead. 
+ * * @public * - * @param isMatch - callback called with the next codepoint, should return whether that matches + * @param isMatch - callback called with the next codepoint, should return whether that matches + * @param expected - expected strings to return if the codepoint does not match */ export function codepoint( isMatch: (codepoint: number) => boolean, @@ -102,6 +105,43 @@ export function codepoint( }; } +/** + * Creates a Parser that skips code points while the given predicate returns true. + * + * This counts in unicode characters (code points), not UTF-16 code units. + * + * This acts like `starConsumed(codepoint(isMatch, []))` if expected is not set, or as + * `plusConsumed(codepoint(isMatch, expected))` if it is, but is much more efficient than either of + * those combinations. + * + * @public + * + * @param isMatch - callback called for each codepoint, should return whether that matches + * @param expected - expected strings to return if the first codepoint does not match + */ +export function codepoints( + isMatch: (codepoint: number) => boolean, + expected?: string[] +): Parser { + return (input: string, offset: number) => { + const startOffset = offset; + while (true) { + const cp = input.codePointAt(offset); + if (cp === undefined) { + break; + } + if (!isMatch(cp)) { + break; + } + offset += cp > 0xffff ? 2 : 1; + } + if (expected !== undefined && offset === startOffset) { + return error(offset, expected); + } + return ok(offset); + }; +} + /** * Creates a Parser that matches a single character from a range of codepoints. 
* diff --git a/test/parser-combinators.tests.ts b/test/parser-combinators.tests.ts index b684b92..fbe9e96 100644 --- a/test/parser-combinators.tests.ts +++ b/test/parser-combinators.tests.ts @@ -23,6 +23,8 @@ import { followed, starConsumed, plusConsumed, + codepoint, + codepoints, } from '../src/parser-combinators'; describe('parser combinators', () => { @@ -39,6 +41,48 @@ describe('parser combinators', () => { }); }); + describe('codepoint', () => { + it('skips a codepoint if it matches', () => { + expect(codepoint(() => true, [])('a', 0).success).toBe(true); + expect(codepoint(() => true, [])('a', 0).offset).toBe(1); + }); + + it('returns expected if it does not', () => { + expect(codepoint(() => false, ['expected'])('a', 0).success).toBe(false); + expect(codepoint(() => false, ['expected'])('a', 0).offset).toBe(0); + expect((codepoint(() => false, ['expected'])('a', 0) as any).expected).toEqual([ + 'expected', + ]); + }); + }); + + describe('codepoints', () => { + it('skips codepoints while they match', () => { + const parser = codepoints((cp) => cp === 'a'.codePointAt(0)); + expect(parser('a', 0).success).toBe(true); + expect(parser('a', 0).offset).toBe(1); + expect(parser('aaab', 0).success).toBe(true); + expect(parser('aaab', 0).offset).toBe(3); + expect(parser('b', 0).success).toBe(true); + expect(parser('b', 0).offset).toBe(0); + }); + + it('handles surrogate pairs', () => { + const parser = codepoints((cp) => cp > 0x10000); + expect(parser('\u{1f4a9}b', 0).success).toBe(true); + expect(parser('\u{1f4a9}b', 0).offset).toBe(2); + }); + + it('needs to match at least one if expected is provided', () => { + const parser = codepoints((cp) => cp === 'a'.codePointAt(0), ['expected']); + expect(parser('a', 0).success).toBe(true); + expect(parser('a', 0).offset).toBe(1); + expect(parser('b', 0).success).toBe(false); + expect(parser('b', 0).offset).toBe(0); + expect((parser('b', 0) as any).expected).toEqual(['expected']); + }); + }); + describe('range', () => { 
it('accepts one from a range of unicode characters', () => { expect(range('a'.codePointAt(0)!, 'z'.codePointAt(0)!)('q', 0).success).toBe(true);