Skip to content

Commit

Permalink
Add codepoints parser
Browse files Browse the repository at this point in the history
  • Loading branch information
bwrrp committed Mar 31, 2022
1 parent ec91556 commit 04025af
Show file tree
Hide file tree
Showing 7 changed files with 186 additions and 4 deletions.
65 changes: 64 additions & 1 deletion api/prsc.api.json
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@
{
"kind": "Function",
"canonicalReference": "prsc!codepoint:function(1)",
"docComment": "/**\n * Creates a Parser that skips the next code point if the given predicate returns true.\n *\n * This counts in unicode characters (code points), not UTF-16 code units.\n *\n * @param isMatch - callback called with the next codepoint, should return whether that matches\n *\n * @public\n */\n",
"docComment": "/**\n * Creates a Parser that skips the next code point if the given predicate returns true.\n *\n * This counts in unicode characters (code points), not UTF-16 code units.\n *\n * To match a sequence of code points, consider using `codepoints` instead.\n *\n * @param isMatch - callback called with the next codepoint, should return whether that matches\n *\n * @param expected - expected strings to return if the codepoint does not match\n *\n * @public\n */\n",
"excerptTokens": [
{
"kind": "Content",
Expand Down Expand Up @@ -232,6 +232,69 @@
],
"name": "codepoint"
},
{
"kind": "Function",
"canonicalReference": "prsc!codepoints:function(1)",
"docComment": "/**\n * Creates a Parser that skips code points while the given predicate returns true.\n *\n * This counts in unicode characters (code points), not UTF-16 code units.\n *\n * This acts like `starConsumed(codepoint(isMatch, []))` if expected is not set, or as `plusConsumed(codepoint(isMatch, expected))` if it is, but is much more efficient than either of those combinations.\n *\n * @param isMatch - callback called for each codepoint, should return whether that matches\n *\n * @param expected - expected strings to return if the first codepoint does not match\n *\n * @public\n */\n",
"excerptTokens": [
{
"kind": "Content",
"text": "export declare function codepoints(isMatch: "
},
{
"kind": "Content",
"text": "(codepoint: number) => boolean"
},
{
"kind": "Content",
"text": ", expected?: "
},
{
"kind": "Content",
"text": "string[]"
},
{
"kind": "Content",
"text": "): "
},
{
"kind": "Reference",
"text": "Parser",
"canonicalReference": "prsc!Parser:type"
},
{
"kind": "Content",
"text": "<void>"
},
{
"kind": "Content",
"text": ";"
}
],
"returnTypeTokenRange": {
"startIndex": 5,
"endIndex": 7
},
"releaseTag": "Public",
"overloadIndex": 1,
"parameters": [
{
"parameterName": "isMatch",
"parameterTypeTokenRange": {
"startIndex": 1,
"endIndex": 2
}
},
{
"parameterName": "expected",
"parameterTypeTokenRange": {
"startIndex": 3,
"endIndex": 4
}
}
],
"name": "codepoints"
},
{
"kind": "Function",
"canonicalReference": "prsc!collect:function(1)",
Expand Down
3 changes: 3 additions & 0 deletions api/prsc.api.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
// @public
export function codepoint(isMatch: (codepoint: number) => boolean, expected: string[]): Parser<void>;

// @public
export function codepoints(isMatch: (codepoint: number) => boolean, expected?: string[]): Parser<void>;

// @public
export function collect<T, R>(gen: Generator<T, R>): [T[], R];

Expand Down
4 changes: 3 additions & 1 deletion docs/prsc.codepoint.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ Creates a Parser that skips the next code point if the given predicate returns t

This counts in unicode characters (code points), not UTF-16 code units.

To match a sequence of code points, consider using `codepoints` instead.

<b>Signature:</b>

```typescript
Expand All @@ -19,7 +21,7 @@ export declare function codepoint(isMatch: (codepoint: number) => boolean, expec
| Parameter | Type | Description |
| --- | --- | --- |
| isMatch | (codepoint: number) =&gt; boolean | callback called with the next codepoint, should return whether that matches |
| expected | string\[\] | |
| expected | string\[\] | expected strings to return if the codepoint does not match |

<b>Returns:</b>

Expand Down
29 changes: 29 additions & 0 deletions docs/prsc.codepoints.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
<!-- Do not edit this file. It is automatically generated by API Documenter. -->

[Home](./index.md) &gt; [prsc](./prsc.md) &gt; [codepoints](./prsc.codepoints.md)

## codepoints() function

Creates a Parser that skips code points while the given predicate returns true.

This counts in unicode characters (code points), not UTF-16 code units.

This acts like `starConsumed(codepoint(isMatch, []))` if expected is not set, or as `plusConsumed(codepoint(isMatch, expected))` if it is, but is much more efficient than either of those combinations.

<b>Signature:</b>

```typescript
export declare function codepoints(isMatch: (codepoint: number) => boolean, expected?: string[]): Parser<void>;
```

## Parameters

| Parameter | Type | Description |
| --- | --- | --- |
| isMatch | (codepoint: number) =&gt; boolean | callback called for each codepoint, should return whether that matches |
| expected | string\[\] | expected strings to return if the first codepoint does not match |

<b>Returns:</b>

[Parser](./prsc.parser.md)<!-- -->&lt;void&gt;

3 changes: 2 additions & 1 deletion docs/prsc.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@

| Function | Description |
| --- | --- |
| [codepoint(isMatch, expected)](./prsc.codepoint.md) | Creates a Parser that skips the next code point if the given predicate returns true.<!-- -->This counts in unicode characters (code points), not UTF-16 code units. |
| [codepoint(isMatch, expected)](./prsc.codepoint.md) | Creates a Parser that skips the next code point if the given predicate returns true.<!-- -->This counts in unicode characters (code points), not UTF-16 code units.<!-- -->To match a sequence of code points, consider using <code>codepoints</code> instead. |
| [codepoints(isMatch, expected)](./prsc.codepoints.md) | Creates a Parser that skips code points while the given predicate returns true.<!-- -->This counts in unicode characters (code points), not UTF-16 code units.<!-- -->This acts like <code>starConsumed(codepoint(isMatch, []))</code> if expected is not set, or as <code>plusConsumed(codepoint(isMatch, expected))</code> if it is, but is much more efficient than either of those combinations. |
| [collect(gen)](./prsc.collect.md) | Helper to collect both the yielded values and the returned value from a generator. |
| [complete(parser)](./prsc.complete.md) | Creates a Parser that applies the given parser and only succeeds (returning the inner parser's result) if parsing concludes at the end of the input string. |
| [consume(parser)](./prsc.consume.md) | Creates a Parser that applies the given parser but discards the resulting value. |
Expand Down
42 changes: 41 additions & 1 deletion src/parser-combinators.ts
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,12 @@ function lengthFromCodePoint(cp: number): number {
*
* This counts in unicode characters (code points), not UTF-16 code units.
*
* To match a sequence of code points, consider using `codepoints` instead.
*
* @public
*
* @param isMatch - callback called with the next codepoint, should return whether that matches
* @param isMatch - callback called with the next codepoint, should return whether that matches
* @param expected - expected strings to return if the codepoint does not match
*/
export function codepoint(
isMatch: (codepoint: number) => boolean,
Expand All @@ -102,6 +105,43 @@ export function codepoint(
};
}

/**
 * Creates a Parser that consumes a run of consecutive code points, continuing for as long as the
 * given predicate accepts them.
 *
 * Offsets advance per unicode character (code point), not per UTF-16 code unit, so a code point
 * above 0xffff moves the offset forward by two.
 *
 * Without `expected` the parser always succeeds, possibly without consuming anything, similar to
 * `starConsumed(codepoint(isMatch, []))`. With `expected` at least one code point has to match,
 * similar to `plusConsumed(codepoint(isMatch, expected))`. It is intended as a much more efficient
 * replacement for either of those combinations.
 *
 * @public
 *
 * @param isMatch  - callback called for each codepoint, should return whether that matches
 * @param expected - expected strings to return if the first codepoint does not match
 */
export function codepoints(
	isMatch: (codepoint: number) => boolean,
	expected?: string[]
): Parser<void> {
	return (input: string, offset: number) => {
		let end = offset;
		// codePointAt yields undefined once `end` is past the last code unit, which ends the run
		for (
			let cp = input.codePointAt(end);
			cp !== undefined && isMatch(cp);
			cp = input.codePointAt(end)
		) {
			// Code points outside the BMP occupy two UTF-16 code units (a surrogate pair)
			end += cp > 0xffff ? 2 : 1;
		}
		// An empty run is only a failure when the caller supplied `expected`
		if (end !== offset || expected === undefined) {
			return ok(end);
		}
		return error(end, expected);
	};
}

/**
* Creates a Parser that matches a single character from a range of codepoints.
*
Expand Down
44 changes: 44 additions & 0 deletions test/parser-combinators.tests.ts
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ import {
followed,
starConsumed,
plusConsumed,
codepoint,
codepoints,
} from '../src/parser-combinators';

describe('parser combinators', () => {
Expand All @@ -39,6 +41,48 @@ describe('parser combinators', () => {
});
});

// Exercises the single-code-point parser: one case where the predicate accepts the next code
// point and one where it rejects it.
describe('codepoint', () => {
	it('skips a codepoint if it matches', () => {
		const acceptAnything = codepoint(() => true, []);
		const result = acceptAnything('a', 0);

		expect(result.success).toBe(true);
		expect(result.offset).toBe(1);
	});

	it('returns expected if it does not', () => {
		const rejectEverything = codepoint(() => false, ['expected']);
		const result = rejectEverything('a', 0);

		expect(result.success).toBe(false);
		// A failed parse reports the offset it was attempted at
		expect(result.offset).toBe(0);
		// Cast mirrors the original test: `expected` is read without narrowing the result first
		expect((result as any).expected).toEqual(['expected']);
	});
});

// Exercises the multi-code-point parser: greedy skipping, surrogate pair handling, and the
// "at least one match" requirement that applies when `expected` is passed.
describe('codepoints', () => {
	it('skips codepoints while they match', () => {
		const skipAs = codepoints((cp) => cp === 'a'.codePointAt(0));
		// [input, offset after parsing from 0]; without `expected` every case succeeds
		const cases: [string, number][] = [
			['a', 1],
			['aaab', 3],
			['b', 0],
		];

		for (const [input, endOffset] of cases) {
			const result = skipAs(input, 0);
			expect(result.success).toBe(true);
			expect(result.offset).toBe(endOffset);
		}
	});

	it('handles surrogate pairs', () => {
		const skipAstral = codepoints((cp) => cp > 0x10000);
		const result = skipAstral('\u{1f4a9}b', 0);

		expect(result.success).toBe(true);
		// The single astral code point spans two UTF-16 code units
		expect(result.offset).toBe(2);
	});

	it('needs to match at least one if expected is provided', () => {
		const oneOrMoreAs = codepoints((cp) => cp === 'a'.codePointAt(0), ['expected']);

		const matched = oneOrMoreAs('a', 0);
		expect(matched.success).toBe(true);
		expect(matched.offset).toBe(1);

		const unmatched = oneOrMoreAs('b', 0);
		expect(unmatched.success).toBe(false);
		expect(unmatched.offset).toBe(0);
		// Cast mirrors the original test: `expected` is read without narrowing the result first
		expect((unmatched as any).expected).toEqual(['expected']);
	});
});

describe('range', () => {
it('accepts one from a range of unicode characters', () => {
expect(range('a'.codePointAt(0)!, 'z'.codePointAt(0)!)('q', 0).success).toBe(true);
Expand Down

0 comments on commit 04025af

Please sign in to comment.