Support optional 'Script=' prefix (from ES2018 syntax) for Unicode script tokens (#225)
slevithan · Jan 18, 2021 · bb35ead · bb35ead
1 parent 4860122
commit bb35ead
Show file tree

Hide file tree

Showing 4 changed files with 74 additions and 14 deletions.
diff --git a/src/addons/unicode-base.js b/src/addons/unicode-base.js
@@ -26,6 +26,7 @@ export default (XRegExp) => {
 
     // Storage for Unicode data
     const unicode = {};
+    const unicodeTypes = {};
 
     // Reuse utils
     const dec = XRegExp._dec;
@@ -123,41 +124,56 @@ export default (XRegExp) => {
      */
     XRegExp.addToken(
         // Use `*` instead of `+` to avoid capturing `^` as the token name in `\p{^}`
-        /\\([pP])(?:{(\^?)([^}]*)}|([A-Za-z]))/,
+        /\\([pP])(?:{(\^?)(?:(Script|sc)=)?([^}]*)}|([A-Za-z]))/,
         (match, scope, flags) => {
             const ERR_DOUBLE_NEG = 'Invalid double negation ';
             const ERR_UNKNOWN_NAME = 'Unknown Unicode token ';
             const ERR_UNKNOWN_REF = 'Unicode token missing data ';
             const ERR_ASTRAL_ONLY = 'Astral mode required for Unicode token ';
             const ERR_ASTRAL_IN_CLASS = 'Astral mode does not support Unicode tokens within character classes';
+            const [
+                fullToken,
+                pPrefix,
+                caretNegation,
+                typePrefix,
+                tokenName,
+                tokenSingleCharName
+            ] = match;
             // Negated via \P{..} or \p{^..}
-            let isNegated = match[1] === 'P' || !!match[2];
+            let isNegated = pPrefix === 'P' || !!caretNegation;
             // Switch from BMP (0-FFFF) to astral (0-10FFFF) mode via flag A
             const isAstralMode = flags.includes('A');
-            // Token lookup name. Check `[4]` first to avoid passing `undefined` via `\p{}`
-            let slug = normalize(match[4] || match[3]);
+            // Token lookup name. Check `tokenSingleCharName` first to avoid passing `undefined`
+            // via `\p{}`
+            let slug = normalize(tokenSingleCharName || tokenName);
             // Token data object
             let item = unicode[slug];
 
-            if (match[1] === 'P' && match[2]) {
-                throw new SyntaxError(ERR_DOUBLE_NEG + match[0]);
+            if (pPrefix === 'P' && caretNegation) {
+                throw new SyntaxError(ERR_DOUBLE_NEG + fullToken);
             }
             if (!unicode.hasOwnProperty(slug)) {
-                throw new SyntaxError(ERR_UNKNOWN_NAME + match[0]);
+                throw new SyntaxError(ERR_UNKNOWN_NAME + fullToken);
+            }
+
+            if (typePrefix) {
+                if (!(unicodeTypes[typePrefix] && unicodeTypes[typePrefix][slug])) {
+                    throw new SyntaxError(ERR_UNKNOWN_NAME + fullToken);
+                }
             }
 
             // Switch to the negated form of the referenced Unicode token
             if (item.inverseOf) {
                 slug = normalize(item.inverseOf);
                 if (!unicode.hasOwnProperty(slug)) {
-                    throw new ReferenceError(`${ERR_UNKNOWN_REF + match[0]} -> ${item.inverseOf}`);
+                    throw new ReferenceError(`${ERR_UNKNOWN_REF + fullToken} -> ${item.inverseOf}`);
                 }
                 item = unicode[slug];
                 isNegated = !isNegated;
             }
 
             if (!(item.bmp || isAstralMode)) {
-                throw new SyntaxError(ERR_ASTRAL_ONLY + match[0]);
+                throw new SyntaxError(ERR_ASTRAL_ONLY + fullToken);
             }
             if (isAstralMode) {
                 if (scope === 'class') {
@@ -196,6 +212,9 @@ export default (XRegExp) => {
      *   character classes and alternation, and should use surrogate pairs to represent astral code
      *   points. `inverseOf` can be used to avoid duplicating character data if a Unicode token is
      *   defined as the exact inverse of another token.
+     * @param {String} [typePrefix] Enables optionally using this type as a prefix for all of the
+     *   provided Unicode tokens, e.g. if given `'Type'`, then `\p{TokenName}` can also be written
+     *   as `\p{Type=TokenName}`.
      * @example
      *
      * // Basic use
@@ -206,20 +225,35 @@ export default (XRegExp) => {
      * }]);
      * XRegExp('\\p{XDigit}:\\p{Hexadecimal}+').test('0:3D'); // -> true
      */
-    XRegExp.addUnicodeData = (data) => {
+    XRegExp.addUnicodeData = (data, typePrefix) => {
         const ERR_NO_NAME = 'Unicode token requires name';
         const ERR_NO_DATA = 'Unicode token has no character data ';
 
+        if (typePrefix) {
+            // Case sensitive to match ES2018
+            unicodeTypes[typePrefix] = {};
+        }
+
         for (const item of data) {
             if (!item.name) {
                 throw new Error(ERR_NO_NAME);
             }
             if (!(item.inverseOf || item.bmp || item.astral)) {
                 throw new Error(ERR_NO_DATA + item.name);
             }
-            unicode[normalize(item.name)] = item;
+
+            const normalizedName = normalize(item.name);
+            unicode[normalizedName] = item;
+            if (typePrefix) {
+                unicodeTypes[typePrefix][normalizedName] = true;
+            }
+
             if (item.alias) {
-                unicode[normalize(item.alias)] = item;
+                const normalizedAlias = normalize(item.alias);
+                unicode[normalizedAlias] = item;
+                if (typePrefix) {
+                    unicodeTypes[typePrefix][normalizedAlias] = true;
+                }
             }
         }
 

diff --git a/src/addons/unicode-scripts.js b/src/addons/unicode-scripts.js
@@ -22,5 +22,5 @@ export default (XRegExp) => {
         throw new ReferenceError('Unicode Base must be loaded before Unicode Scripts');
     }
 
-    XRegExp.addUnicodeData(scripts);
+    XRegExp.addUnicodeData(scripts, 'Script');
 };
diff --git a/tests/spec/s-addons-unicode.js b/tests/spec/s-addons-unicode.js
@@ -417,6 +417,10 @@ describe('Unicode Categories addon:', function() {
         expect(function() {XRegExp('\\p{IsP}');}).toThrowError(SyntaxError);
     });
 
+    it('should not allow the "Script=" prefix for category names', function() {
+        expect(function() {XRegExp('\\p{Script=P}');}).toThrowError(SyntaxError);
+    });
+
     it('should handle \\p{Cn}', function() {
         testUnicodeToken('Cn', {
             invalid: ['\u20BA']
@@ -489,6 +493,10 @@ describe('Unicode Properties addon:', function() {
         expect(function() {XRegExp('\\p{IsASCII}');}).toThrowError(SyntaxError);
     });
 
+    it('should not allow the "Script=" prefix for property names', function() {
+        expect(function() {XRegExp('\\p{Script=ASCII}');}).toThrowError(SyntaxError);
+    });
+
     it('should handle \\p{Alphabetic}', function() {
         testUnicodeToken('Alphabetic', {
             valid: ['A', 'a', 'Å', 'å', '日', 'ي'],
@@ -529,6 +537,21 @@ describe('Unicode Scripts addon:', function() {
         expect(function() {XRegExp('\\p{IsLatin}');}).toThrowError(SyntaxError);
     });
 
+    it('should allow the "Script=" prefix for script names', function() {
+        expect(function() {XRegExp('\\p{Script=Latin}');}).not.toThrow();
+        testUnicodeToken('Script=Latin', {
+            valid: ['A', 'B', 'C'],
+            invalid: ['カ', 'タ', 'ナ']
+        });
+    });
+
+    it('should handle \\p{Latin}', function() {
+        testUnicodeToken('Latin', {
+            valid: ['A', 'B', 'C'],
+            invalid: ['カ', 'タ', 'ナ']
+        });
+    });
+
     it('should handle \\p{Katakana}', function() {
         testUnicodeToken('Katakana', {
             valid: ['カ', 'タ', 'ナ'],

diff --git a/types/index.d.ts b/types/index.d.ts
@@ -497,6 +497,9 @@ declare namespace XRegExp {
      *   character classes and alternation, and should use surrogate pairs to represent astral code
      *   points. `inverseOf` can be used to avoid duplicating character data if a Unicode token is
      *   defined as the exact inverse of another token.
+     * @param typePrefix - Enables optionally using this type as a prefix for all of the
+     *   provided Unicode tokens, e.g. if given `'Type'`, then `\p{TokenName}` can also be written
+     *   as `\p{Type=TokenName}`.
      * @example
      *
      * // Basic use
@@ -507,7 +510,7 @@ declare namespace XRegExp {
      * }]);
      * XRegExp('\\p{XDigit}:\\p{Hexadecimal}+').test('0:3D'); // -> true
      */
-    function addUnicodeData(data: UnicodeCharacterRange[]): void;
+    function addUnicodeData(data: UnicodeCharacterRange[], typePrefix?: string): void;
 
     /**
      * Builds regexes using named subpatterns, for readability and pattern reuse. Backreferences in