Migrate to Java: PredefinedPosixCharSets

marianobarrios · Apr 7, 2023 · 46e94d9 · 46e94d9
1 parent 7782b8e
commit 46e94d9
Show file tree

Hide file tree

Showing 8 changed files with 154 additions and 88 deletions.
diff --git a/src/main/scala/dregex/impl/PredefinedCharSets.scala b/src/main/scala/dregex/impl/PredefinedCharSets.scala
@@ -14,56 +14,6 @@ object PredefinedCharSets {
 
   private[this] val logger = LoggerFactory.getLogger(PredefinedCharSets.getClass)
 
-  val unicodeBlocks: Map[String, CharSet] = {
-    val ret = collection.mutable.Map[String, CharSet]()
-    for ((block, range) <- UnicodeDatabase.blockRanges.asScala) {
-      val charSet = CharSet.fromRange(CharRange(range.from, range.to))
-      ret.put(UnicodeDatabaseReader.canonicalizeBlockName(block), charSet)
-    }
-    for ((block, alias) <- UnicodeDatabase.blockSynonyms.asScala) {
-      ret.put(UnicodeDatabaseReader.canonicalizeBlockName(alias), ret(UnicodeDatabaseReader.canonicalizeBlockName(block)))
-    }
-    ret.toMap
-  }
-
-  val unicodeScripts: Map[String, CharSet] = {
-    val ret = collection.mutable.Map[String, CharSet]()
-    for ((block, ranges) <- UnicodeDatabase.scriptRanges.asScala) {
-      val chatSet = CharSet(ranges.asScala.toSeq.map(range => CharRange(range.from, range.to)))
-      ret.put(block.toUpperCase, chatSet)
-    }
-    for ((script, alias) <- UnicodeDatabase.scriptSynomyms.asScala) {
-      ret.put(alias.toUpperCase, ret(script.toUpperCase))
-    }
-    ret.toMap
-  }
-
-  val lower = CharSet.fromRange(CharRange(from = 'a', to = 'z'))
-  val upper = CharSet.fromRange(CharRange(from = 'A', to = 'Z'))
-  val alpha = CharSet.fromCharSets(lower, upper)
-  val digit = CharSet.fromRange(CharRange(from = '0', to = '9'))
-  val alnum = CharSet.fromCharSets(alpha, digit)
-  val punct = CharSet("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""".map(char => Lit(char)))
-  val graph = CharSet.fromCharSets(alnum, punct)
-  val space = CharSet(Seq(Lit('\n'), Lit('\t'), Lit('\r'), Lit('\f'), Lit(' '), Lit(0x0B)))
-  val wordChar = CharSet(alnum.ranges :+ Lit('_'))
-
-  val posixClasses = Map(
-    "Lower" -> lower,
-    "Upper" -> upper,
-    "ASCII" -> CharSet.fromRange(CharRange(from = 0, to = 0x7F)),
-    "Alpha" -> alpha,
-    "Digit" -> digit,
-    "Alnum" -> alnum,
-    "Punct" -> punct,
-    "Graph" -> graph,
-    "Print" -> CharSet(graph.ranges :+ Lit(0x20)),
-    "Blank" -> CharSet(Seq(Lit(0x20), Lit('\t'))),
-    "Cntrl" -> CharSet(Seq(CharRange(from = 0, to = 0x1F), Lit(0x7F))),
-    "XDigit" -> CharSet(digit.ranges ++ Seq(CharRange(from = 'a', to = 'f'), CharRange(from = 'A', to = 'F'))),
-    "Space" -> space
-  )
-
   // Unicode version of POSIX-defined character classes
 
   val unicodeDigit = unicodeBinaryProperties("DIGIT")
@@ -101,7 +51,7 @@ object PredefinedCharSets {
   val unicodePosixClasses = Map(
     "Lower" -> unicodeBinaryProperties("LOWERCASE"),
     "Upper" -> unicodeBinaryProperties("UPPERCASE"),
-    "ASCII" -> posixClasses("ASCII"),
+    "ASCII" -> PredefinedPosixCharSets.classes.get("ASCII"),
     "Alpha" -> unicodeBinaryProperties("ALPHABETIC"),
     "Digit" -> unicodeDigit,
     "Alnum" -> CharSet.fromCharSets(unicodeBinaryProperties("ALPHABETIC"), unicodeDigit),

diff --git a/src/main/scala/dregex/impl/PredefinedPosixCharSets.java b/src/main/scala/dregex/impl/PredefinedPosixCharSets.java
@@ -0,0 +1,44 @@
+package dregex.impl;
+
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static dregex.impl.RegexTree.CharSet$;
+import static dregex.impl.RegexTree.CharSet;
+import static dregex.impl.RegexTree.Lit;
+import static dregex.impl.RegexTree.CharRange;
+
+/**
+ * ASCII-only character sets for the POSIX character classes, plus the building
+ * blocks (digit, space, word character) they are composed from.
+ *
+ * <p>All fields are constants: they are assigned once at class initialization and
+ * must not be reassigned.
+ */
+public class PredefinedPosixCharSets {
+
+    /** ASCII lowercase letters, {@code a} to {@code z}. */
+    public static final CharSet lower = CharSet$.MODULE$.fromRange(new CharRange('a', 'z'));
+
+    /** ASCII uppercase letters, {@code A} to {@code Z}. */
+    public static final CharSet upper = CharSet$.MODULE$.fromRange(new CharRange('A', 'Z'));
+
+    /** Union of {@link #lower} and {@link #upper}. */
+    public static final CharSet alpha = CharSet$.MODULE$.fromCharSetsJava(List.of(lower, upper));
+
+    /** ASCII decimal digits, {@code 0} to {@code 9}. */
+    public static final CharSet digit = CharSet$.MODULE$.fromRange(new CharRange('0', '9'));
+
+    /** Union of {@link #alpha} and {@link #digit}. */
+    public static final CharSet alnum = CharSet$.MODULE$.fromCharSetsJava(List.of(alpha, digit));
+
+    /** One literal per character of the ASCII punctuation string below. */
+    public static final CharSet punct = CharSet$.MODULE$.fromJava(
+            "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars().mapToObj(ch -> new Lit(ch)).collect(Collectors.toList()));
+
+    /** Union of {@link #alnum} and {@link #punct}. */
+    public static final CharSet graph = CharSet$.MODULE$.fromCharSetsJava(List.of(alnum, punct));
+
+    /** Newline, tab, carriage return, form feed, space and 0x0B. */
+    public static final CharSet space = CharSet$.MODULE$.fromJava(List.of(new Lit('\n'), new Lit('\t'),
+            new Lit('\r'), new Lit('\f'), new Lit(' '), new Lit(0x0B)));
+
+    /** The ranges of {@link #alnum} plus the underscore. */
+    public static final CharSet wordChar = CharSet$.MODULE$.fromJava(
+            Stream.concat(alnum.javaRanges().stream(), Stream.of(new Lit('_'))).collect(Collectors.toList()));
+
+    /** Immutable map from POSIX class name (e.g. {@code "Alpha"}) to its character set. */
+    public static final Map<String, CharSet> classes = Map.ofEntries(
+            Map.entry("Lower", lower),
+            Map.entry("Upper", upper),
+            Map.entry("ASCII", CharSet$.MODULE$.fromRange(new CharRange(0, 0x7F))),
+            Map.entry("Alpha", alpha),
+            Map.entry("Digit", digit),
+            Map.entry("Alnum", alnum),
+            Map.entry("Punct", punct),
+            Map.entry("Graph", graph),
+            // Print = Graph plus the space character (0x20)
+            Map.entry("Print", CharSet$.MODULE$.fromJava(Stream.concat(graph.javaRanges().stream(),
+                    Stream.of(new Lit(0x20))).collect(Collectors.toList()))),
+            Map.entry("Blank", CharSet$.MODULE$.fromJava(List.of(new Lit(0x20), new Lit('\t')))),
+            Map.entry("Cntrl", CharSet$.MODULE$.fromJava(List.of(new CharRange(0, 0x1F), new Lit(0x7F)))),
+            // XDigit = decimal digits plus a-f and A-F
+            Map.entry("XDigit", CharSet$.MODULE$.fromJava(Stream.concat(digit.javaRanges().stream(),
+                    Stream.of(new CharRange('a','f'), new CharRange('A', 'F'))).collect(Collectors.toList()))),
+            Map.entry("Space", space));
+}
diff --git a/src/main/scala/dregex/impl/RegexParser.scala b/src/main/scala/dregex/impl/RegexParser.scala
@@ -6,6 +6,7 @@ import dregex.{InvalidRegexException, ParsedRegex}
 import dregex.impl.RegexParser.DotMatch
 
 import scala.util.parsing.combinator.RegexParsers
+import scala.jdk.CollectionConverters._
 
 class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean) extends RegexParsers {
 
@@ -137,11 +138,17 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
     case propName ~ _ ~ propValue =>
       if (propName == "block" || propName == "blk") {
         val canonicalBlockName = UnicodeDatabaseReader.canonicalizeBlockName(propValue)
-        PredefinedCharSets.unicodeBlocks
-          .getOrElse(canonicalBlockName, throw new InvalidRegexException("Invalid Unicode block: " + propValue))
+        val block = UnicodeBlocks.unicodeBlocks.get(canonicalBlockName)
+        if (block == null) {
+          throw new InvalidRegexException("Invalid Unicode block: " + propValue)
+        }
+        block
       } else if (propName == "script" || propName == "sc") {
-        PredefinedCharSets.unicodeScripts
-          .getOrElse(propValue.toUpperCase(), throw new InvalidRegexException("Invalid Unicode script: " + propValue))
+        val script = UnicodeScripts.unicodeScripts.get(propValue.toUpperCase())
+        if (script == null) {
+          throw new InvalidRegexException("Invalid Unicode script: " + propValue)
+        }
+        script
       } else if (propName == "general_category" || propName == "gc") {
         PredefinedCharSets.unicodeGeneralCategories
           .getOrElse(propValue, throw new InvalidRegexException("Invalid Unicode general category: " + propValue))
@@ -155,7 +162,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
      * If the property starts with "Is" it could be either a script,
      * general category or a binary property. Look for all.
      */
-    PredefinedCharSets.unicodeScripts
+    UnicodeScripts.unicodeScripts.asScala
       .get(name.toUpperCase())
       .orElse(PredefinedCharSets.unicodeGeneralCategories.get(name))
       .orElse(PredefinedCharSets.unicodeBinaryProperties.get(name.toUpperCase()))
@@ -165,8 +172,11 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
   }
 
   def specialCharSetWithIn = backslash ~ "p" ~ "{" ~ "In" ~> unicodeSubsetName <~ "}" ^^ { blockName =>
-    PredefinedCharSets.unicodeBlocks
-      .getOrElse(UnicodeDatabaseReader.canonicalizeBlockName(blockName), throw new InvalidRegexException("Invalid Unicode block: " + blockName))
+    val block = UnicodeBlocks.unicodeBlocks.get(UnicodeDatabaseReader.canonicalizeBlockName(blockName))
+    if (block == null) {
+      throw new InvalidRegexException("Invalid Unicode block: " + blockName)
+    }
+    block
   }
 
   def specialCharSetWithJava = backslash ~ "p" ~ "{" ~ "java" ~> unicodeSubsetName <~ "}" ^^ { charClass =>
@@ -184,7 +194,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
       if (unicodeClasses) {
         PredefinedCharSets.unicodePosixClasses
       } else {
-        PredefinedCharSets.posixClasses
+        PredefinedPosixCharSets.classes.asScala
       }
     }
     effPosixClasses.get(name).orElse(PredefinedCharSets.unicodeGeneralCategories.get(name)).getOrElse {
@@ -228,42 +238,42 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
     if (unicodeClasses)
       PredefinedCharSets.unicodeDigit
     else
-      PredefinedCharSets.digit
+      PredefinedPosixCharSets.digit
   }
 
   def shorthandCharSetDigitCompl = backslash ~ "D" ^^^ {
     if (unicodeClasses)
       PredefinedCharSets.unicodeDigit.complement
     else
-      PredefinedCharSets.digit.complement
+      PredefinedPosixCharSets.digit.complement
   }
 
   def shorthandCharSetSpace = backslash ~ "s" ^^^ {
     if (unicodeClasses)
       PredefinedCharSets.unicodeSpace
     else
-      PredefinedCharSets.space
+      PredefinedPosixCharSets.space
   }
 
   def shorthandCharSetSpaceCompl = backslash ~ "S" ^^^ {
     if (unicodeClasses)
       PredefinedCharSets.unicodeSpace.complement
     else
-      PredefinedCharSets.space.complement
+      PredefinedPosixCharSets.space.complement
   }
 
   def shorthandCharSetWord = backslash ~ "w" ^^^ {
     if (unicodeClasses)
       PredefinedCharSets.unicodeWordChar
     else
-      PredefinedCharSets.wordChar
+      PredefinedPosixCharSets.wordChar
   }
 
   def shorthandCharSetWordCompl = backslash ~ "W" ^^^ {
     if (unicodeClasses)
       PredefinedCharSets.unicodeWordChar.complement
     else
-      PredefinedCharSets.wordChar.complement
+      PredefinedPosixCharSets.wordChar.complement
   }
 
   def charClass = "[" ~> "^".? ~ "-".? ~ charClassAtom.+ ~ "-".? <~ "]" ^^ {

diff --git a/src/main/scala/dregex/impl/RegexTree.scala b/src/main/scala/dregex/impl/RegexTree.scala
@@ -1,6 +1,7 @@
 package dregex.impl
 
 import scala.runtime.ScalaRunTime
+import scala.jdk.CollectionConverters._
 
 sealed trait Direction
 object Direction {
@@ -109,11 +110,19 @@ object RegexTree {
     def canonical = this
     def precedence = 1
     override def toString = s"${getClass.getSimpleName}(${ranges.mkString(",")})"
+    def javaRanges() = ranges.asJava
   }
 
   /** Factory methods for building a `CharSet` from ranges or from other char sets. */
   object CharSet {
     /** Builds a single `CharSet` containing the ranges of all the given char sets. */
     def fromCharSets(charSets: CharSet*): CharSet = CharSet(charSets.to(Seq).flatMap(_.ranges))
+
     /** Same as `fromCharSets`, taking a `java.util.List` so it can be called from Java. */
+    def fromCharSetsJava(charSets: java.util.List[CharSet]) = fromCharSets(charSets.asScala.toSeq: _*)
+
     /** Builds a `CharSet` consisting of exactly one range. */
     def fromRange(interval: AbstractRange) = CharSet(Seq(interval))
+
     /** Builds a `CharSet` from a `java.util.List` of ranges, for callers written in Java. */
+    def fromJava(ranges: java.util.List[AbstractRange]): CharSet = {
+      CharSet(ranges.asScala.toSeq)
+    }
   }
 
   case class Disj(values: Seq[Node]) extends ComplexPart {

diff --git a/src/main/scala/dregex/impl/UnicodeBlocks.java b/src/main/scala/dregex/impl/UnicodeBlocks.java
@@ -0,0 +1,42 @@
+package dregex.impl;
+
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Lookup table from canonicalized Unicode block names to the character set covering
+ * each block, built once from the {@code /Blocks.txt} classpath resource.
+ */
+public class UnicodeBlocks {
+
+    /** Block name (as read from {@code /Blocks.txt}) to its code point range. */
+    private static final Map<String, UnicodeDatabaseReader.Range> ranges;
+
+    static {
+        try (var blocksFile = UnicodeBlocks.class.getResourceAsStream("/Blocks.txt")) {
+            // getResourceAsStream returns null when the resource is absent; fail with a
+            // clear message instead of a bare NullPointerException from the reader.
+            if (blocksFile == null) {
+                throw new IllegalStateException("Classpath resource /Blocks.txt not found");
+            }
+            // Decode explicitly as UTF-8 rather than with the platform default charset.
+            ranges = UnicodeDatabaseReader.getBlocks(
+                    new InputStreamReader(blocksFile, java.nio.charset.StandardCharsets.UTF_8));
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Block name to an additional alias under which the same block is registered. */
+    private static final Map<String, String> synonyms = Map.of(
+            "Greek and Coptic", "Greek"
+    );
+
+    /**
+     * Unmodifiable map from canonicalized block name (see
+     * {@code UnicodeDatabaseReader.canonicalizeBlockName}) to the block's character set.
+     * {@code get} returns {@code null} for unknown names.
+     */
+    public static final Map<String, RegexTree.CharSet> unicodeBlocks;
+
+    static {
+        var byName = new HashMap<String, RegexTree.CharSet>();
+        for (var entry : ranges.entrySet()) {
+            var range = entry.getValue();
+            var charSet = RegexTree.CharSet$.MODULE$.fromRange(new RegexTree.CharRange(range.from, range.to));
+            byName.put(UnicodeDatabaseReader.canonicalizeBlockName(entry.getKey()), charSet);
+        }
+        for (var entry : synonyms.entrySet()) {
+            var target = byName.get(UnicodeDatabaseReader.canonicalizeBlockName(entry.getKey()));
+            // Never register an alias that points at nothing: a null value would make
+            // the alias look present to containsKey-style lookups.
+            if (target != null) {
+                byName.put(UnicodeDatabaseReader.canonicalizeBlockName(entry.getValue()), target);
+            }
+        }
+        unicodeBlocks = java.util.Collections.unmodifiableMap(byName);
+    }
+
+}
diff --git a/src/main/scala/dregex/impl/UnicodeDatabase.java b/src/main/scala/dregex/impl/UnicodeScripts.java
@@ -2,36 +2,24 @@
 
 import java.io.IOException;
 import java.io.InputStreamReader;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;
 
-public class UnicodeDatabase {
+public class UnicodeScripts {
 
-    public static Map<String, UnicodeDatabaseReader.Range> blockRanges;
+    private static final Map<String, List<UnicodeDatabaseReader.Range>> ranges;
 
     static {
-        try (var blocksFile = UnicodeDatabase.class.getResourceAsStream("/Blocks.txt")) {
-            blockRanges = UnicodeDatabaseReader.getBlocks(new InputStreamReader(blocksFile));
+        try (var scriptsFile = UnicodeScripts.class.getResourceAsStream("/Scripts.txt")) {
+            ranges = UnicodeDatabaseReader.getScripts(new InputStreamReader(scriptsFile));
         } catch (IOException e) {
             throw new RuntimeException(e);
         }
     }
 
-    public static Map<String, String> blockSynonyms = Map.of(
-            "Greek and Coptic", "Greek"
-    );
-
-    public static Map<String, List<UnicodeDatabaseReader.Range>> scriptRanges;
-
-    static {
-        try (var scriptsFile = UnicodeDatabase.class.getResourceAsStream("/Scripts.txt")) {
-            scriptRanges = UnicodeDatabaseReader.getScripts(new InputStreamReader(scriptsFile));
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-    }
-
-    public static Map<String, String> scriptSynomyms = Map.ofEntries(
+    private static final Map<String, String> synomyms = Map.ofEntries(
             Map.entry("COMMON", "ZYYY"),
             Map.entry("LATIN", "LATN"),
             Map.entry("GREEK", "GREK"),
@@ -189,4 +177,23 @@ public class UnicodeDatabase {
             Map.entry("DIVES_AKURU", "DIAK"),
             Map.entry("KHITAN_SMALL_SCRIPT", "KITS"));
 
+
+    public static final Map<String, RegexTree.CharSet> unicodeScripts;
+
+    // Builds the public script table: every script name, upper-cased, maps to the
+    // character set made of its ranges; each synonym is then registered as an extra
+    // key for the same character set. The result is published as an unmodifiable map.
+    static {
+        var byName = new HashMap<String, RegexTree.CharSet>();
+        for (var entry : ranges.entrySet()) {
+            var scriptRanges = entry.getValue();
+            var charSet = RegexTree.CharSet$.MODULE$.fromJava(scriptRanges.stream().map(
+                    range -> new RegexTree.CharRange(range.from, range.to)).collect(Collectors.toList()));
+            // NOTE(review): toUpperCase() uses the default locale; lookups must
+            // upper-case keys the same way for them to match.
+            byName.put(entry.getKey().toUpperCase(), charSet);
+        }
+        for (var entry : synomyms.entrySet()) {
+            var target = byName.get(entry.getKey().toUpperCase());
+            // Skip aliases whose script is not in the table, so the map never holds
+            // a null value.
+            if (target != null) {
+                byName.put(entry.getValue().toUpperCase(), target);
+            }
+        }
+        unicodeScripts = java.util.Collections.unmodifiableMap(byName);
+    }
+
 }
diff --git a/src/test/scala/dregex/TreeGenerator.scala b/src/test/scala/dregex/TreeGenerator.scala
@@ -1,7 +1,7 @@
 package dregex
 
 import dregex.impl.RegexTree
-import dregex.impl.PredefinedCharSets
+import dregex.impl.PredefinedCharSets2
 
 /**
   * Generate some sample regex trees, useful for testing.
@@ -17,7 +17,7 @@ class TreeGenerator {
         Wildcard,
         CharSet.fromRange(CharRange('d', 'f')),
         CharSet.fromRange(CharRange('d', 'f')).complement,
-        PredefinedCharSets.digit)
+        PredefinedCharSets2.digit)
     } else {
       generateFixedDepth(levels - 1).flatMap { node =>
         val simple = Iterator(