Inline UnicodeChar

Author: marianobarrios · Date: Mar 26, 2023
Commit: 8cd9867 (1 parent: a238b07)

Showing 11 changed files with 102 additions and 186 deletions.
diff --git a/src/main/scala/dregex/impl/CharInterval.java b/src/main/scala/dregex/impl/CharInterval.java
@@ -6,18 +6,11 @@
 
 public final class CharInterval implements AtomPart, Ordered<CharInterval> {
 
-    public final UnicodeChar from;
-    public final UnicodeChar to;
+    public final int from;
+    public final int to;
 
-    public CharInterval(UnicodeChar from, UnicodeChar to) {
-        if (from == null) {
-            throw new NullPointerException("from is null");
-        }
-        if (to == null) {
-            throw new NullPointerException("to is null");
-        }
-
-        if (from.compare(to) > 0) {
+    public CharInterval(int from, int to) {
+        if (from > to) {
             throw new IllegalArgumentException("from value cannot be larger than to");
         }
         this.from = from;
@@ -29,7 +22,7 @@ public boolean equals(Object o) {
         if (this == o) return true;
         if (o == null || getClass() != o.getClass()) return false;
         CharInterval that = (CharInterval) o;
-        return Objects.equals(from, that.from) && Objects.equals(to, that.to);
+        return from == that.from && to == that.to;
     }
 
     @Override
@@ -39,28 +32,28 @@ public int hashCode() {
 
     @Override
     public int compare(CharInterval that) {
-        return this.from.compare(that.from);
+        return Integer.compare(from, that.from);
     }
 
     public String toString() {
-        if (from.equals(to)) {
-            return from.toString();
+        if (from == to) {
+            return Integer.toString(from);
         } else {
             return String.format("[%s-%s]", from, to);
         }
     }
 
     public static Map<RegexTree.AbstractRange, List<CharInterval>> calculateNonOverlapping(List<RegexTree.AbstractRange> ranges) {
-        Set<UnicodeChar> startSet = new HashSet<>();
-        Set<UnicodeChar> endSet = new HashSet<>();
+        Set<Integer> startSet = new HashSet<>();
+        Set<Integer> endSet = new HashSet<>();
         for (var range : ranges) {
             startSet.add(range.from());
-            if (range.from().compare(UnicodeChar.min()) > 0) {
-                endSet.add(range.from().$minus(1));
+            if (range.from() > Character.MIN_CODE_POINT) {
+                endSet.add(range.from() - 1);
             }
             endSet.add(range.to());
-            if (range.to().compare(UnicodeChar.max()) < 0) {
-                startSet.add(range.to().$plus(1));
+            if (range.to() < Character.MAX_CODE_POINT) {
+                startSet.add(range.to() + 1);
             }
         }
         Map<RegexTree.AbstractRange, List<CharInterval>> ret = new HashMap<>();

diff --git a/src/main/scala/dregex/impl/DfaAlgorithms.scala b/src/main/scala/dregex/impl/DfaAlgorithms.scala
@@ -280,12 +280,11 @@ object DfaAlgorithms {
     var current = dfa.initial
     var i = 0
     for (codePoint <- string.codePoints.iterator.asScala) {
-      val char = UnicodeChar(codePoint)
       val currentTrans = dfa.defTransitions.getOrElse(current, SortedMap[CharInterval, A]())
       // O(log transitions) search in the range tree
-      val newState = Util.floorEntry(currentTrans, new CharInterval(char, char)).flatMap {
+      val newState = Util.floorEntry(currentTrans, new CharInterval(codePoint, codePoint)).flatMap {
         case (interval, state) =>
-          if (interval.to >= char) {
+          if (interval.to >= codePoint) {
             Some(state)
           } else {
             None

diff --git a/src/main/scala/dregex/impl/PredefinedCharSets.scala b/src/main/scala/dregex/impl/PredefinedCharSets.scala
@@ -2,8 +2,6 @@ package dregex.impl
 
 import dregex.impl.RegexTree.AbstractRange
 import dregex.impl.RegexTree.Lit
-import dregex.impl.UnicodeChar.FromCharConversion
-import dregex.impl.UnicodeChar.FromIntConversion
 import dregex.impl.RegexTree.CharSet
 import dregex.impl.RegexTree.CharRange
 
@@ -19,7 +17,7 @@ object PredefinedCharSets {
   val unicodeBlocks: Map[String, CharSet] = {
     val ret = collection.mutable.Map[String, CharSet]()
     for ((block, range) <- UnicodeDatabase.blockRanges.asScala) {
-      val charSet = CharSet.fromRange(CharRange(range.from.u, range.to.u))
+      val charSet = CharSet.fromRange(CharRange(range.from, range.to))
       ret.put(UnicodeDatabaseReader.canonicalizeBlockName(block), charSet)
     }
     for ((block, alias) <- UnicodeDatabase.blockSynonyms.asScala) {
@@ -31,7 +29,7 @@ object PredefinedCharSets {
   val unicodeScripts: Map[String, CharSet] = {
     val ret = collection.mutable.Map[String, CharSet]()
     for ((block, ranges) <- UnicodeDatabase.scriptRanges.asScala) {
-      val chatSet = CharSet(ranges.asScala.toSeq.map(range => CharRange(range.from.u, range.to.u)))
+      val chatSet = CharSet(ranges.asScala.toSeq.map(range => CharRange(range.from, range.to)))
       ret.put(block.toUpperCase, chatSet)
     }
     for ((script, alias) <- UnicodeDatabase.scriptSynomyms.asScala) {
@@ -40,29 +38,29 @@ object PredefinedCharSets {
     ret.toMap
   }
 
-  val lower = CharSet.fromRange(CharRange(from = 'a'.u, to = 'z'.u))
-  val upper = CharSet.fromRange(CharRange(from = 'A'.u, to = 'Z'.u))
+  val lower = CharSet.fromRange(CharRange(from = 'a', to = 'z'))
+  val upper = CharSet.fromRange(CharRange(from = 'A', to = 'Z'))
   val alpha = CharSet.fromCharSets(lower, upper)
-  val digit = CharSet.fromRange(CharRange(from = '0'.u, to = '9'.u))
+  val digit = CharSet.fromRange(CharRange(from = '0', to = '9'))
   val alnum = CharSet.fromCharSets(alpha, digit)
-  val punct = CharSet("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""".map(char => Lit(UnicodeChar(char))))
+  val punct = CharSet("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""".map(char => Lit(char)))
   val graph = CharSet.fromCharSets(alnum, punct)
-  val space = CharSet(Seq(Lit('\n'.u), Lit('\t'.u), Lit('\r'.u), Lit('\f'.u), Lit(' '.u), Lit(0x0B.u)))
-  val wordChar = CharSet(alnum.ranges :+ Lit('_'.u))
+  val space = CharSet(Seq(Lit('\n'), Lit('\t'), Lit('\r'), Lit('\f'), Lit(' '), Lit(0x0B)))
+  val wordChar = CharSet(alnum.ranges :+ Lit('_'))
 
   val posixClasses = Map(
     "Lower" -> lower,
     "Upper" -> upper,
-    "ASCII" -> CharSet.fromRange(CharRange(from = 0.u, to = 0x7F.u)),
+    "ASCII" -> CharSet.fromRange(CharRange(from = 0, to = 0x7F)),
     "Alpha" -> alpha,
     "Digit" -> digit,
     "Alnum" -> alnum,
     "Punct" -> punct,
     "Graph" -> graph,
-    "Print" -> CharSet(graph.ranges :+ Lit(0x20.u)),
-    "Blank" -> CharSet(Seq(Lit(0x20.u), Lit('\t'.u))),
-    "Cntrl" -> CharSet(Seq(CharRange(from = 0.u, to = 0x1F.u), Lit(0x7F.u))),
-    "XDigit" -> CharSet(digit.ranges ++ Seq(CharRange(from = 'a'.u, to = 'f'.u), CharRange(from = 'A'.u, to = 'F'.u))),
+    "Print" -> CharSet(graph.ranges :+ Lit(0x20)),
+    "Blank" -> CharSet(Seq(Lit(0x20), Lit('\t'))),
+    "Cntrl" -> CharSet(Seq(CharRange(from = 0, to = 0x1F), Lit(0x7F))),
+    "XDigit" -> CharSet(digit.ranges ++ Seq(CharRange(from = 'a', to = 'f'), CharRange(from = 'A', to = 'F'))),
     "Space" -> space
   )
 
@@ -86,7 +84,7 @@ object PredefinedCharSets {
       unicodeSpace.ranges,
       unicodeGeneralCategories("Zl").ranges ++
         unicodeGeneralCategories("Zp").ranges ++
-        Seq(CharRange(from = '\u000a'.u, to = '\u000d'.u)) ++ Seq(Lit('\u0085'.u))
+        Seq(CharRange(from = '\u000a', to = '\u000d')) ++ Seq(Lit('\u0085'))
     ))
 
   val unicodeWordChar = CharSet.fromCharSets(
@@ -127,8 +125,8 @@ object PredefinedCharSets {
 
   lazy val allUnicodeLit: Seq[Lit] = {
     val (ret, elapsed) = Util.time {
-      for (codePoint <- UnicodeChar.min.codePoint to UnicodeChar.max.codePoint) yield {
-        Lit(codePoint.u)
+      for (codePoint <- Character.MIN_CODE_POINT to Character.MAX_CODE_POINT) yield {
+        Lit(codePoint)
       }
     }
     logger.debug(s"initialized ${ret.size} Unicode literals in $elapsed")
@@ -139,7 +137,7 @@ object PredefinedCharSets {
     val (ret, elapsed) = Util.time {
       val builder = collection.mutable.Map[String, ArrayBuffer[AbstractRange]]()
       for (lit <- allUnicodeLit) {
-        val categoryJavaId = Character.getType(lit.char.codePoint).toByte
+        val categoryJavaId = Character.getType(lit.codePoint).toByte
         val category = GeneralCategory.categories(categoryJavaId)
         builder.getOrElseUpdate(category, ArrayBuffer()) += lit
         val parentCategory = category.substring(0, 1) // first letter
@@ -158,7 +156,7 @@ object PredefinedCharSets {
       for {
         lit <- allUnicodeLit
         (prop, fn) <- GeneralCategory.binaryProperties
-        if fn(lit.char.codePoint)
+        if fn(lit.codePoint)
       } {
         builder.getOrElseUpdate(prop, ArrayBuffer()) += lit
       }
@@ -172,7 +170,7 @@ object PredefinedCharSets {
     val (ret, elapsed) = Util.time {
       val builder = collection.mutable.Map[String, ArrayBuffer[AbstractRange]]()
       for (lit <- allUnicodeLit) {
-        for ((prop, fn) <- JavaCharacterProperties.properties if fn(lit.char.codePoint)) {
+        for ((prop, fn) <- JavaCharacterProperties.properties if fn(lit.codePoint)) {
           builder.getOrElseUpdate(prop, ArrayBuffer()) += lit
         }
       }

diff --git a/src/main/scala/dregex/impl/RegexParser.scala b/src/main/scala/dregex/impl/RegexParser.scala
@@ -4,7 +4,6 @@ import java.util.regex.Pattern
 
 import dregex.{InvalidRegexException, ParsedRegex}
 import dregex.impl.RegexParser.DotMatch
-import dregex.impl.UnicodeChar.FromCharConversion
 
 import scala.util.parsing.combinator.RegexParsers
 
@@ -70,39 +69,39 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
   // Parsers that return a literal Node
 
   def specialEscape = backslash ~> "[^dwsDWSuxcpR0123456789]".r ^^ {
-    case "n" => Lit('\n'.u)
-    case "r" => Lit('\r'.u)
-    case "t" => Lit('\t'.u)
-    case "f" => Lit('\f'.u)
-    case "b" => Lit('\b'.u)
-    case "v" => Lit('\u000B'.u) // vertical tab
-    case "a" => Lit('\u0007'.u) // bell
-    case "e" => Lit('\u001B'.u) // escape
-    case "B" => Lit('\\'.u)
-    case c   => Lit(UnicodeChar.fromSingletonString(c)) // remaining escaped characters stand for themselves
+    case "n" => Lit('\n')
+    case "r" => Lit('\r')
+    case "t" => Lit('\t')
+    case "f" => Lit('\f')
+    case "b" => Lit('\b')
+    case "v" => Lit('\u000B') // vertical tab
+    case "a" => Lit('\u0007') // bell
+    case "e" => Lit('\u001B') // escape
+    case "B" => Lit('\\')
+    case c   => Lit.fromSingletonString(c) // remaining escaped characters stand for themselves
   }
 
   def doubleUnicodeEscape = backslash ~ "u" ~ hexNumber(4) ~ backslash ~ "u" ~ hexNumber(4) ^? {
     case _ ~ _ ~ highNumber ~ _ ~ _ ~ lowNumber
         if Character.isHighSurrogate(highNumber.toChar) && Character.isLowSurrogate(lowNumber.toChar) =>
       val codePoint = Character.toCodePoint(highNumber.toChar, lowNumber.toChar)
-      Lit(UnicodeChar(codePoint))
+      Lit(codePoint)
   }
 
   def unicodeEscape = backslash ~ "u" ~> hexNumber(4) ^^ { codePoint =>
-    Lit(UnicodeChar(codePoint))
+    Lit(codePoint)
   }
 
   def hexEscape = backslash ~ "x" ~> hexNumber(2) ^^ { codePoint =>
-    Lit(UnicodeChar(codePoint))
+    Lit(codePoint)
   }
 
   def longHexEscape = backslash ~ "x" ~ "{" ~> hexNumber <~ "}" ^^ { codePoint =>
-    Lit(UnicodeChar(codePoint))
+    Lit(codePoint)
   }
 
   def octalEscape = backslash ~ "0" ~> (octalNumber(1) ||| octalNumber(2) ||| octalNumber(3)) ^^ { codePoint =>
-    Lit(UnicodeChar(codePoint))
+    Lit(codePoint)
   }
 
   /**
@@ -118,7 +117,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
       controlEscape |
       backReference
 
-  def anythingExcept(parser: Parser[_]) = not(parser) ~> (".".r ^^ (x => Lit(UnicodeChar.fromSingletonString(x))))
+  def anythingExcept(parser: Parser[_]) = not(parser) ~> (".".r ^^ (x => Lit.fromSingletonString(x)))
 
   def charLit = anchor | anythingExcept(charSpecial) | anyEscape
 
@@ -129,7 +128,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
   def singleCharacterClassLit = characterClassLit ^^ (lit => CharSet(Seq(lit)))
 
   def charClassRange = characterClassLit ~ "-" ~ characterClassLit ^^ {
-    case start ~ _ ~ end => CharSet.fromRange(CharRange(start.char, end.char))
+    case start ~ _ ~ end => CharSet.fromRange(CharRange(start.codePoint, end.codePoint))
   }
 
   private val unicodeSubsetName = "[0-9a-zA-Z_ -]+".r
@@ -209,7 +208,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
   // There is the special case of a character class with only one character: the dash. This is valid, but
   // not easily parsed by the general constructs.
   def dashClass = "[" ~> "^".? <~ "-" ~ "]" ^^ { negated =>
-    val set = CharSet.fromRange(Lit('-'.u))
+    val set = CharSet.fromRange(Lit('-'))
     if (negated.isDefined) {
       set.complement
     } else {
@@ -271,7 +270,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
     case negated ~ leftDash ~ charClass ~ rightDash =>
       val chars =
         if (leftDash.isDefined || rightDash.isDefined)
-          charClass :+ CharSet.fromRange(Lit('-'.u))
+          charClass :+ CharSet.fromRange(Lit('-'))
         else
           charClass
       val set = CharSet.fromCharSets(chars: _*)
@@ -289,17 +288,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
   }
 
   def unicodeLineBreak = backslash ~ "R" ^^^ {
-    Disj(
-      Seq(
-        Juxt(Seq(Lit('\u000D'.u), Lit('\u000A'.u))),
-        Lit('\u000A'.u),
-        Lit('\u000B'.u),
-        Lit('\u000C'.u),
-        Lit('\u000D'.u),
-        Lit('\u0085'.u),
-        Lit('\u2028'.u),
-        Lit('\u2029'.u)
-      ))
+    Disj(Seq(Juxt(Seq(Lit('\u000D'), Lit('\u000A'))), Lit('\u000A'), Lit('\u000B'), Lit('\u000C'), Lit('\u000D'), Lit('\u0085'), Lit('\u2028'), Lit('\u2029')))
   }
 
   def group = "(" ~> ("?" ~ "<".? ~ "[:=!]".r).? ~ sp ~ regex <~ sp ~ ")" ^^ {
@@ -327,16 +316,9 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
       case DotMatch.All =>
         Wildcard
       case DotMatch.JavaLines =>
-        CharSet(
-          Seq(
-            Lit('\n'.u),
-            Lit('\r'.u),
-            Lit('\u0085'.u),
-            Lit('\u2028'.u),
-            Lit('\u2829'.u)
-          )).complement
+        CharSet(Seq(Lit('\n'), Lit('\r'), Lit('\u0085'), Lit('\u2028'), Lit('\u2029'))).complement
       case DotMatch.UnixLines =>
-        CharSet.fromRange(Lit('\n'.u)).complement
+        CharSet.fromRange(Lit('\n')).complement
     }
   }
 
@@ -463,7 +445,7 @@ object RegexParser {
     */
   private def parseLiteralRegex(regex: String): ParsedRegex = {
     val literals: Seq[RegexTree.Lit] = regex.map { char =>
-      RegexTree.Lit(UnicodeChar.fromChar(char))
+      RegexTree.Lit(char)
     }
     new ParsedRegex(regex, RegexTree.Juxt(literals), Normalization.NoNormalization)
   }