Skip to content

Commit

Permalink
Inline UnicodeChar
Browse files Browse the repository at this point in the history
  • Loading branch information
marianobarrios committed Mar 26, 2023
1 parent a238b07 commit 8cd9867
Show file tree
Hide file tree
Showing 11 changed files with 102 additions and 186 deletions.
35 changes: 14 additions & 21 deletions src/main/scala/dregex/impl/CharInterval.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,11 @@

public final class CharInterval implements AtomPart, Ordered<CharInterval> {

public final UnicodeChar from;
public final UnicodeChar to;
public final int from;
public final int to;

public CharInterval(UnicodeChar from, UnicodeChar to) {
if (from == null) {
throw new NullPointerException("from is null");
}
if (to == null) {
throw new NullPointerException("to is null");
}

if (from.compare(to) > 0) {
public CharInterval(int from, int to) {
if (from > to) {
throw new IllegalArgumentException("from value cannot be larger than to");
}
this.from = from;
Expand All @@ -29,7 +22,7 @@ public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
CharInterval that = (CharInterval) o;
return Objects.equals(from, that.from) && Objects.equals(to, that.to);
return from == that.from && to == that.to;
}

@Override
Expand All @@ -39,28 +32,28 @@ public int hashCode() {

/**
 * Orders intervals by their lower bound only; the upper bound is not consulted.
 */
@Override
public int compare(CharInterval that) {
return this.from.compare(that.from);
return Integer.compare(from, that.from);
}

/**
 * Renders an interval whose bounds are equal as that single bound, and any
 * other interval as "[from-to]".
 * NOTE(review): with int bounds the values are printed as decimal numbers
 * (Integer.toString and %s on an int) -- confirm this is the intended output.
 */
public String toString() {
if (from.equals(to)) {
return from.toString();
if (from == to) {
return Integer.toString(from);
} else {
return String.format("[%s-%s]", from, to);
}
}

public static Map<RegexTree.AbstractRange, List<CharInterval>> calculateNonOverlapping(List<RegexTree.AbstractRange> ranges) {
Set<UnicodeChar> startSet = new HashSet<>();
Set<UnicodeChar> endSet = new HashSet<>();
Set<Integer> startSet = new HashSet<>();
Set<Integer> endSet = new HashSet<>();
for (var range : ranges) {
startSet.add(range.from());
if (range.from().compare(UnicodeChar.min()) > 0) {
endSet.add(range.from().$minus(1));
if (range.from() > Character.MIN_CODE_POINT) {
endSet.add(range.from() - 1);
}
endSet.add(range.to());
if (range.to().compare(UnicodeChar.max()) < 0) {
startSet.add(range.to().$plus(1));
if (range.to() < Character.MAX_CODE_POINT) {
startSet.add(range.to() + 1);
}
}
Map<RegexTree.AbstractRange, List<CharInterval>> ret = new HashMap<>();
Expand Down
5 changes: 2 additions & 3 deletions src/main/scala/dregex/impl/DfaAlgorithms.scala
Original file line number Diff line number Diff line change
Expand Up @@ -280,12 +280,11 @@ object DfaAlgorithms {
var current = dfa.initial
var i = 0
for (codePoint <- string.codePoints.iterator.asScala) {
val char = UnicodeChar(codePoint)
val currentTrans = dfa.defTransitions.getOrElse(current, SortedMap[CharInterval, A]())
// O(log transitions) search in the range tree
val newState = Util.floorEntry(currentTrans, new CharInterval(char, char)).flatMap {
val newState = Util.floorEntry(currentTrans, new CharInterval(codePoint, codePoint)).flatMap {
case (interval, state) =>
if (interval.to >= char) {
if (interval.to >= codePoint) {
Some(state)
} else {
None
Expand Down
40 changes: 19 additions & 21 deletions src/main/scala/dregex/impl/PredefinedCharSets.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ package dregex.impl

import dregex.impl.RegexTree.AbstractRange
import dregex.impl.RegexTree.Lit
import dregex.impl.UnicodeChar.FromCharConversion
import dregex.impl.UnicodeChar.FromIntConversion
import dregex.impl.RegexTree.CharSet
import dregex.impl.RegexTree.CharRange

Expand All @@ -19,7 +17,7 @@ object PredefinedCharSets {
val unicodeBlocks: Map[String, CharSet] = {
val ret = collection.mutable.Map[String, CharSet]()
for ((block, range) <- UnicodeDatabase.blockRanges.asScala) {
val charSet = CharSet.fromRange(CharRange(range.from.u, range.to.u))
val charSet = CharSet.fromRange(CharRange(range.from, range.to))
ret.put(UnicodeDatabaseReader.canonicalizeBlockName(block), charSet)
}
for ((block, alias) <- UnicodeDatabase.blockSynonyms.asScala) {
Expand All @@ -31,7 +29,7 @@ object PredefinedCharSets {
val unicodeScripts: Map[String, CharSet] = {
val ret = collection.mutable.Map[String, CharSet]()
for ((block, ranges) <- UnicodeDatabase.scriptRanges.asScala) {
val chatSet = CharSet(ranges.asScala.toSeq.map(range => CharRange(range.from.u, range.to.u)))
val chatSet = CharSet(ranges.asScala.toSeq.map(range => CharRange(range.from, range.to)))
ret.put(block.toUpperCase, chatSet)
}
for ((script, alias) <- UnicodeDatabase.scriptSynomyms.asScala) {
Expand All @@ -40,29 +38,29 @@ object PredefinedCharSets {
ret.toMap
}

// ASCII building blocks. Three basic ranges (a-z, A-Z, 0-9) are combined into
// the larger sets; the later definitions reuse the earlier ones.
val lower = CharSet.fromRange(CharRange(from = 'a'.u, to = 'z'.u))
val upper = CharSet.fromRange(CharRange(from = 'A'.u, to = 'Z'.u))
val lower = CharSet.fromRange(CharRange(from = 'a', to = 'z'))
val upper = CharSet.fromRange(CharRange(from = 'A', to = 'Z'))
val alpha = CharSet.fromCharSets(lower, upper)
val digit = CharSet.fromRange(CharRange(from = '0'.u, to = '9'.u))
val digit = CharSet.fromRange(CharRange(from = '0', to = '9'))
val alnum = CharSet.fromCharSets(alpha, digit)
// punct: one literal per character of the raw string
val punct = CharSet("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""".map(char => Lit(UnicodeChar(char))))
val punct = CharSet("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""".map(char => Lit(char)))
val graph = CharSet.fromCharSets(alnum, punct)
// space: newline, tab, carriage return, form feed, space and 0x0B (vertical tab)
val space = CharSet(Seq(Lit('\n'.u), Lit('\t'.u), Lit('\r'.u), Lit('\f'.u), Lit(' '.u), Lit(0x0B.u)))
// wordChar: alphanumerics plus underscore
val wordChar = CharSet(alnum.ranges :+ Lit('_'.u))
val space = CharSet(Seq(Lit('\n'), Lit('\t'), Lit('\r'), Lit('\f'), Lit(' '), Lit(0x0B)))
val wordChar = CharSet(alnum.ranges :+ Lit('_'))

// Lookup table from class name to its ASCII-only character set. Entries either
// reuse one of the sets defined above or build a new one in place:
//   ASCII  = 0x00-0x7F          Print  = Graph plus 0x20 (space)
//   Blank  = space and tab      Cntrl  = 0x00-0x1F plus 0x7F
//   XDigit = digits plus a-f and A-F
val posixClasses = Map(
"Lower" -> lower,
"Upper" -> upper,
"ASCII" -> CharSet.fromRange(CharRange(from = 0.u, to = 0x7F.u)),
"ASCII" -> CharSet.fromRange(CharRange(from = 0, to = 0x7F)),
"Alpha" -> alpha,
"Digit" -> digit,
"Alnum" -> alnum,
"Punct" -> punct,
"Graph" -> graph,
"Print" -> CharSet(graph.ranges :+ Lit(0x20.u)),
"Blank" -> CharSet(Seq(Lit(0x20.u), Lit('\t'.u))),
"Cntrl" -> CharSet(Seq(CharRange(from = 0.u, to = 0x1F.u), Lit(0x7F.u))),
"XDigit" -> CharSet(digit.ranges ++ Seq(CharRange(from = 'a'.u, to = 'f'.u), CharRange(from = 'A'.u, to = 'F'.u))),
"Print" -> CharSet(graph.ranges :+ Lit(0x20)),
"Blank" -> CharSet(Seq(Lit(0x20), Lit('\t'))),
"Cntrl" -> CharSet(Seq(CharRange(from = 0, to = 0x1F), Lit(0x7F))),
"XDigit" -> CharSet(digit.ranges ++ Seq(CharRange(from = 'a', to = 'f'), CharRange(from = 'A', to = 'F'))),
"Space" -> space
)

Expand All @@ -86,7 +84,7 @@ object PredefinedCharSets {
unicodeSpace.ranges,
unicodeGeneralCategories("Zl").ranges ++
unicodeGeneralCategories("Zp").ranges ++
Seq(CharRange(from = '\u000a'.u, to = '\u000d'.u)) ++ Seq(Lit('\u0085'.u))
Seq(CharRange(from = '\u000a', to = '\u000d')) ++ Seq(Lit('\u0085'))
))

val unicodeWordChar = CharSet.fromCharSets(
Expand Down Expand Up @@ -127,8 +125,8 @@ object PredefinedCharSets {

lazy val allUnicodeLit: Seq[Lit] = {
val (ret, elapsed) = Util.time {
for (codePoint <- UnicodeChar.min.codePoint to UnicodeChar.max.codePoint) yield {
Lit(codePoint.u)
for (codePoint <- Character.MIN_CODE_POINT to Character.MAX_CODE_POINT) yield {
Lit(codePoint)
}
}
logger.debug(s"initialized ${ret.size} Unicode literals in $elapsed")
Expand All @@ -139,7 +137,7 @@ object PredefinedCharSets {
val (ret, elapsed) = Util.time {
val builder = collection.mutable.Map[String, ArrayBuffer[AbstractRange]]()
for (lit <- allUnicodeLit) {
val categoryJavaId = Character.getType(lit.char.codePoint).toByte
val categoryJavaId = Character.getType(lit.codePoint).toByte
val category = GeneralCategory.categories(categoryJavaId)
builder.getOrElseUpdate(category, ArrayBuffer()) += lit
val parentCategory = category.substring(0, 1) // first letter
Expand All @@ -158,7 +156,7 @@ object PredefinedCharSets {
for {
lit <- allUnicodeLit
(prop, fn) <- GeneralCategory.binaryProperties
if fn(lit.char.codePoint)
if fn(lit.codePoint)
} {
builder.getOrElseUpdate(prop, ArrayBuffer()) += lit
}
Expand All @@ -172,7 +170,7 @@ object PredefinedCharSets {
val (ret, elapsed) = Util.time {
val builder = collection.mutable.Map[String, ArrayBuffer[AbstractRange]]()
for (lit <- allUnicodeLit) {
for ((prop, fn) <- JavaCharacterProperties.properties if fn(lit.char.codePoint)) {
for ((prop, fn) <- JavaCharacterProperties.properties if fn(lit.codePoint)) {
builder.getOrElseUpdate(prop, ArrayBuffer()) += lit
}
}
Expand Down
64 changes: 23 additions & 41 deletions src/main/scala/dregex/impl/RegexParser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import java.util.regex.Pattern

import dregex.{InvalidRegexException, ParsedRegex}
import dregex.impl.RegexParser.DotMatch
import dregex.impl.UnicodeChar.FromCharConversion

import scala.util.parsing.combinator.RegexParsers

Expand Down Expand Up @@ -70,39 +69,39 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
// Parsers that return a literal Node

/**
 * A backslash followed by one character that is not excluded by the regex
 * below (d, w, s, D, W, S, u, x, c, p, R and the decimal digits).
 * The letters n, r, t, f, b, v, a and e map to control characters, B maps to
 * a backslash, and any other character stands for itself.
 */
def specialEscape = backslash ~> "[^dwsDWSuxcpR0123456789]".r ^^ {
case "n" => Lit('\n'.u)
case "r" => Lit('\r'.u)
case "t" => Lit('\t'.u)
case "f" => Lit('\f'.u)
case "b" => Lit('\b'.u)
case "v" => Lit('\u000B'.u) // vertical tab
case "a" => Lit('\u0007'.u) // bell
case "e" => Lit('\u001B'.u) // escape
case "B" => Lit('\\'.u)
case c => Lit(UnicodeChar.fromSingletonString(c)) // remaining escaped characters stand for themselves
case "n" => Lit('\n')
case "r" => Lit('\r')
case "t" => Lit('\t')
case "f" => Lit('\f')
case "b" => Lit('\b')
case "v" => Lit('\u000B') // vertical tab
case "a" => Lit('\u0007') // bell
case "e" => Lit('\u001B') // escape
case "B" => Lit('\\')
case c => Lit.fromSingletonString(c) // remaining escaped characters stand for themselves
}

/**
 * Two consecutive backslash-u escapes whose values are a high surrogate
 * followed by a low surrogate. They are combined into a single literal for
 * the resulting code point. Because the mapping is partial (`^?`), the parser
 * fails when the two values do not form such a pair.
 */
def doubleUnicodeEscape = backslash ~ "u" ~ hexNumber(4) ~ backslash ~ "u" ~ hexNumber(4) ^? {
case _ ~ _ ~ highNumber ~ _ ~ _ ~ lowNumber
if Character.isHighSurrogate(highNumber.toChar) && Character.isLowSurrogate(lowNumber.toChar) =>
val codePoint = Character.toCodePoint(highNumber.toChar, lowNumber.toChar)
Lit(UnicodeChar(codePoint))
Lit(codePoint)
}

/**
 * A single backslash-u escape; the parsed number becomes the literal's code point.
 * NOTE(review): presumably hexNumber(4) reads exactly four hex digits -- confirm.
 */
def unicodeEscape = backslash ~ "u" ~> hexNumber(4) ^^ { codePoint =>
Lit(UnicodeChar(codePoint))
Lit(codePoint)
}

/**
 * A backslash-x escape; the parsed number becomes the literal's code point.
 * NOTE(review): presumably hexNumber(2) reads exactly two hex digits -- confirm.
 */
def hexEscape = backslash ~ "x" ~> hexNumber(2) ^^ { codePoint =>
Lit(UnicodeChar(codePoint))
Lit(codePoint)
}

/**
 * A backslash-x escape with the number wrapped in braces; the parsed number
 * becomes the literal's code point.
 * NOTE(review): hexNumber without an argument presumably accepts a variable
 * number of digits -- confirm.
 */
def longHexEscape = backslash ~ "x" ~ "{" ~> hexNumber <~ "}" ^^ { codePoint =>
Lit(UnicodeChar(codePoint))
Lit(codePoint)
}

/**
 * A backslash-zero escape followed by an octal number; the parsed number
 * becomes the literal's code point. `|||` keeps whichever alternative consumes
 * the most input.
 * NOTE(review): presumably octalNumber(n) reads exactly n octal digits, so up
 * to three digits are taken -- confirm.
 */
def octalEscape = backslash ~ "0" ~> (octalNumber(1) ||| octalNumber(2) ||| octalNumber(3)) ^^ { codePoint =>
Lit(UnicodeChar(codePoint))
Lit(codePoint)
}

/**
Expand All @@ -118,7 +117,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
controlEscape |
backReference

/**
 * Succeeds only where `parser` does NOT match, then consumes whatever the
 * regex "." matches and turns that one-element string into a literal.
 */
def anythingExcept(parser: Parser[_]) = not(parser) ~> (".".r ^^ (x => Lit(UnicodeChar.fromSingletonString(x))))
def anythingExcept(parser: Parser[_]) = not(parser) ~> (".".r ^^ (x => Lit.fromSingletonString(x)))

// Tries, in order: an anchor, any character that charSpecial does not match, then any escape.
def charLit = anchor | anythingExcept(charSpecial) | anyEscape

Expand All @@ -129,7 +128,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
// Wraps one character-class literal into a CharSet containing only that literal.
def singleCharacterClassLit = characterClassLit ^^ (lit => CharSet(Seq(lit)))

/**
 * A range inside a character class: two literals separated by a dash. The
 * result is a CharSet spanning from the first literal to the second.
 */
def charClassRange = characterClassLit ~ "-" ~ characterClassLit ^^ {
case start ~ _ ~ end => CharSet.fromRange(CharRange(start.char, end.char))
case start ~ _ ~ end => CharSet.fromRange(CharRange(start.codePoint, end.codePoint))
}

private val unicodeSubsetName = "[0-9a-zA-Z_ -]+".r
Expand Down Expand Up @@ -209,7 +208,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
// There is the special case of a character class with only one character: the dash. This is valid, but
// not easily parsed by the general constructs.
def dashClass = "[" ~> "^".? <~ "-" ~ "]" ^^ { negated =>
val set = CharSet.fromRange(Lit('-'.u))
val set = CharSet.fromRange(Lit('-'))
if (negated.isDefined) {
set.complement
} else {
Expand Down Expand Up @@ -271,7 +270,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
case negated ~ leftDash ~ charClass ~ rightDash =>
val chars =
if (leftDash.isDefined || rightDash.isDefined)
charClass :+ CharSet.fromRange(Lit('-'.u))
charClass :+ CharSet.fromRange(Lit('-'))
else
charClass
val set = CharSet.fromCharSets(chars: _*)
Expand All @@ -289,17 +288,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
}

/**
 * The backslash-R linebreak matcher: either the two-character sequence
 * U+000D U+000A, or any single one of U+000A, U+000B, U+000C, U+000D,
 * U+0085, U+2028 and U+2029.
 */
def unicodeLineBreak = backslash ~ "R" ^^^ {
Disj(
Seq(
Juxt(Seq(Lit('\u000D'.u), Lit('\u000A'.u))),
Lit('\u000A'.u),
Lit('\u000B'.u),
Lit('\u000C'.u),
Lit('\u000D'.u),
Lit('\u0085'.u),
Lit('\u2028'.u),
Lit('\u2029'.u)
))
Disj(Seq(Juxt(Seq(Lit('\u000D'), Lit('\u000A'))), Lit('\u000A'), Lit('\u000B'), Lit('\u000C'), Lit('\u000D'), Lit('\u0085'), Lit('\u2028'), Lit('\u2029')))
}

def group = "(" ~> ("?" ~ "<".? ~ "[:=!]".r).? ~ sp ~ regex <~ sp ~ ")" ^^ {
Expand Down Expand Up @@ -327,16 +316,9 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
case DotMatch.All =>
Wildcard
case DotMatch.JavaLines =>
// In this mode the dot matches everything except the Java line terminators:
// newline, carriage return, U+0085 (next line), U+2028 (line separator) and
// U+2029 (paragraph separator). The last entry used to be spelled U+2829, a
// Braille pattern character, which made the dot accept U+2029 and reject
// U+2829. The backslash-R parser in this file already lists U+2029.
CharSet(
Seq(
Lit('\n'.u),
Lit('\r'.u),
Lit('\u0085'.u),
Lit('\u2028'.u),
Lit('\u2029'.u)
)).complement
CharSet(Seq(Lit('\n'), Lit('\r'), Lit('\u0085'), Lit('\u2028'), Lit('\u2029'))).complement
case DotMatch.UnixLines =>
CharSet.fromRange(Lit('\n'.u)).complement
CharSet.fromRange(Lit('\n')).complement
}
}

Expand Down Expand Up @@ -463,7 +445,7 @@ object RegexParser {
*/
private def parseLiteralRegex(regex: String): ParsedRegex = {
// Walk the input by Unicode code point rather than by UTF-16 char, so that a
// supplementary character (stored as a surrogate pair) becomes ONE literal.
// The escape parsers in this class also produce one literal per code point;
// two separate surrogate literals would not line up with them.
// Characters in the Basic Multilingual Plane yield the same literals as before.
val literals: Seq[RegexTree.Lit] = regex.codePoints.toArray.toSeq.map { codePoint =>
RegexTree.Lit(UnicodeChar(codePoint))
RegexTree.Lit(codePoint)
}
// The whole input is a plain juxtaposition of literals; no normalization is applied.
new ParsedRegex(regex, RegexTree.Juxt(literals), Normalization.NoNormalization)
}
Expand Down
Loading

0 comments on commit 8cd9867

Please sign in to comment.