Skip to content

Commit

Permalink
Support optional 'Script=' prefix (from ES2018 syntax) for Unicode script tokens (#225)
Browse files Browse the repository at this point in the history
  • Loading branch information
slevithan committed Jan 18, 2021
1 parent 4860122 commit bb35ead
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 14 deletions.
58 changes: 46 additions & 12 deletions src/addons/unicode-base.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ export default (XRegExp) => {

// Storage for Unicode data
const unicode = {};
const unicodeTypes = {};

// Reuse utils
const dec = XRegExp._dec;
Expand Down Expand Up @@ -123,41 +124,56 @@ export default (XRegExp) => {
*/
XRegExp.addToken(
// Use `*` instead of `+` to avoid capturing `^` as the token name in `\p{^}`
/\\([pP])(?:{(\^?)([^}]*)}|([A-Za-z]))/,
/\\([pP])(?:{(\^?)(?:(Script|sc)=)?([^}]*)}|([A-Za-z]))/,
(match, scope, flags) => {
const ERR_DOUBLE_NEG = 'Invalid double negation ';
const ERR_UNKNOWN_NAME = 'Unknown Unicode token ';
const ERR_UNKNOWN_REF = 'Unicode token missing data ';
const ERR_ASTRAL_ONLY = 'Astral mode required for Unicode token ';
const ERR_ASTRAL_IN_CLASS = 'Astral mode does not support Unicode tokens within character classes';
const [
fullToken,
pPrefix,
caretNegation,
typePrefix,
tokenName,
tokenSingleCharName
] = match;
// Negated via \P{..} or \p{^..}
let isNegated = match[1] === 'P' || !!match[2];
let isNegated = pPrefix === 'P' || !!caretNegation;
// Switch from BMP (0-FFFF) to astral (0-10FFFF) mode via flag A
const isAstralMode = flags.includes('A');
// Token lookup name. Check `[4]` first to avoid passing `undefined` via `\p{}`
let slug = normalize(match[4] || match[3]);
// Token lookup name. Check `tokenSingleCharName` first to avoid passing `undefined`
// via `\p{}`
let slug = normalize(tokenSingleCharName || tokenName);
// Token data object
let item = unicode[slug];

if (match[1] === 'P' && match[2]) {
throw new SyntaxError(ERR_DOUBLE_NEG + match[0]);
if (pPrefix === 'P' && caretNegation) {
throw new SyntaxError(ERR_DOUBLE_NEG + fullToken);
}
if (!unicode.hasOwnProperty(slug)) {
throw new SyntaxError(ERR_UNKNOWN_NAME + match[0]);
throw new SyntaxError(ERR_UNKNOWN_NAME + fullToken);
}

if (typePrefix) {
if (!(unicodeTypes[typePrefix] && unicodeTypes[typePrefix][slug])) {
throw new SyntaxError(ERR_UNKNOWN_NAME + fullToken);
}
}

// Switch to the negated form of the referenced Unicode token
if (item.inverseOf) {
slug = normalize(item.inverseOf);
if (!unicode.hasOwnProperty(slug)) {
throw new ReferenceError(`${ERR_UNKNOWN_REF + match[0]} -> ${item.inverseOf}`);
throw new ReferenceError(`${ERR_UNKNOWN_REF + fullToken} -> ${item.inverseOf}`);
}
item = unicode[slug];
isNegated = !isNegated;
}

if (!(item.bmp || isAstralMode)) {
throw new SyntaxError(ERR_ASTRAL_ONLY + match[0]);
throw new SyntaxError(ERR_ASTRAL_ONLY + fullToken);
}
if (isAstralMode) {
if (scope === 'class') {
Expand Down Expand Up @@ -196,6 +212,9 @@ export default (XRegExp) => {
* character classes and alternation, and should use surrogate pairs to represent astral code
* points. `inverseOf` can be used to avoid duplicating character data if a Unicode token is
* defined as the exact inverse of another token.
* @param {String} [typePrefix] Enables optionally using this type as a prefix for all of the
* provided Unicode tokens, e.g. if given `'Type'`, then `\p{TokenName}` can also be written
* as `\p{Type=TokenName}`.
* @example
*
* // Basic use
Expand All @@ -206,20 +225,35 @@ export default (XRegExp) => {
* }]);
* XRegExp('\\p{XDigit}:\\p{Hexadecimal}+').test('0:3D'); // -> true
*/
XRegExp.addUnicodeData = (data) => {
XRegExp.addUnicodeData = (data, typePrefix) => {
const ERR_NO_NAME = 'Unicode token requires name';
const ERR_NO_DATA = 'Unicode token has no character data ';

if (typePrefix) {
// Case sensitive to match ES2018
unicodeTypes[typePrefix] = {};
}

for (const item of data) {
if (!item.name) {
throw new Error(ERR_NO_NAME);
}
if (!(item.inverseOf || item.bmp || item.astral)) {
throw new Error(ERR_NO_DATA + item.name);
}
unicode[normalize(item.name)] = item;

const normalizedName = normalize(item.name);
unicode[normalizedName] = item;
if (typePrefix) {
unicodeTypes[typePrefix][normalizedName] = true;
}

if (item.alias) {
unicode[normalize(item.alias)] = item;
const normalizedAlias = normalize(item.alias);
unicode[normalizedAlias] = item;
if (typePrefix) {
unicodeTypes[typePrefix][normalizedAlias] = true;
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion src/addons/unicode-scripts.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,5 @@ export default (XRegExp) => {
throw new ReferenceError('Unicode Base must be loaded before Unicode Scripts');
}

XRegExp.addUnicodeData(scripts);
XRegExp.addUnicodeData(scripts, 'Script');
};
23 changes: 23 additions & 0 deletions tests/spec/s-addons-unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -417,6 +417,10 @@ describe('Unicode Categories addon:', function() {
expect(function() {XRegExp('\\p{IsP}');}).toThrowError(SyntaxError);
});

it('should not allow the "Script=" prefix for category names', function() {
    // `P` is exercised here as a category name; qualifying it with `Script=` is
    // expected to be rejected with a SyntaxError when the pattern is compiled.
    function compileCategoryWithScriptPrefix() {
        XRegExp('\\p{Script=P}');
    }

    expect(compileCategoryWithScriptPrefix).toThrowError(SyntaxError);
});

it('should handle \\p{Cn}', function() {
testUnicodeToken('Cn', {
invalid: ['\u20BA']
Expand Down Expand Up @@ -489,6 +493,10 @@ describe('Unicode Properties addon:', function() {
expect(function() {XRegExp('\\p{IsASCII}');}).toThrowError(SyntaxError);
});

it('should not allow the "Script=" prefix for property names', function() {
    // `ASCII` is exercised here as a property name; qualifying it with `Script=` is
    // expected to be rejected with a SyntaxError when the pattern is compiled.
    function compilePropertyWithScriptPrefix() {
        XRegExp('\\p{Script=ASCII}');
    }

    expect(compilePropertyWithScriptPrefix).toThrowError(SyntaxError);
});

it('should handle \\p{Alphabetic}', function() {
testUnicodeToken('Alphabetic', {
valid: ['A', 'a', 'Å', 'å', '日', 'ي'],
Expand Down Expand Up @@ -529,6 +537,21 @@ describe('Unicode Scripts addon:', function() {
expect(function() {XRegExp('\\p{IsLatin}');}).toThrowError(SyntaxError);
});

it('should allow the "Script=" prefix for script names', function() {
    // Compiling the prefixed form of a script token must not throw
    function compileScriptWithPrefix() {
        XRegExp('\\p{Script=Latin}');
    }

    expect(compileScriptWithPrefix).not.toThrow();

    // The prefixed token is then passed to the shared `testUnicodeToken` helper with
    // Latin letters listed as `valid` and Katakana characters listed as `invalid`.
    // NOTE(review): the helper is defined outside this view; presumably it asserts that
    // `valid` characters match the token and `invalid` ones do not -- confirm.
    var latinExpectations = {
        valid: ['A', 'B', 'C'],
        invalid: ['カ', 'タ', 'ナ']
    };
    testUnicodeToken('Script=Latin', latinExpectations);
});

it('should handle \\p{Latin}', function() {
    // Unprefixed form of the Latin script token, checked with Latin letters listed as
    // `valid` and Katakana characters listed as `invalid`.
    var latinExpectations = {
        valid: ['A', 'B', 'C'],
        invalid: ['カ', 'タ', 'ナ']
    };

    testUnicodeToken('Latin', latinExpectations);
});

it('should handle \\p{Katakana}', function() {
testUnicodeToken('Katakana', {
valid: ['カ', 'タ', 'ナ'],
Expand Down
5 changes: 4 additions & 1 deletion types/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,9 @@ declare namespace XRegExp {
* character classes and alternation, and should use surrogate pairs to represent astral code
* points. `inverseOf` can be used to avoid duplicating character data if a Unicode token is
* defined as the exact inverse of another token.
* @param typePrefix - Enables optionally using this type as a prefix for all of the
* provided Unicode tokens, e.g. if given `'Type'`, then `\p{TokenName}` can also be written
* as `\p{Type=TokenName}`.
* @example
*
* // Basic use
Expand All @@ -507,7 +510,7 @@ declare namespace XRegExp {
* }]);
* XRegExp('\\p{XDigit}:\\p{Hexadecimal}+').test('0:3D'); // -> true
*/
function addUnicodeData(data: UnicodeCharacterRange[]): void;
function addUnicodeData(data: UnicodeCharacterRange[], typePrefix?: string): void;

/**
* Builds regexes using named subpatterns, for readability and pattern reuse. Backreferences in
Expand Down

0 comments on commit bb35ead

Please sign in to comment.