Skip to content

Commit

Permalink
Migrate to Java: PredefinedPosixCharSets
Browse files Browse the repository at this point in the history
  • Loading branch information
marianobarrios committed Apr 7, 2023
1 parent 7782b8e commit 46e94d9
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 88 deletions.
52 changes: 1 addition & 51 deletions src/main/scala/dregex/impl/PredefinedCharSets.scala
Original file line number Diff line number Diff line change
Expand Up @@ -14,56 +14,6 @@ object PredefinedCharSets {

private[this] val logger = LoggerFactory.getLogger(PredefinedCharSets.getClass)

val unicodeBlocks: Map[String, CharSet] = {
val ret = collection.mutable.Map[String, CharSet]()
for ((block, range) <- UnicodeDatabase.blockRanges.asScala) {
val charSet = CharSet.fromRange(CharRange(range.from, range.to))
ret.put(UnicodeDatabaseReader.canonicalizeBlockName(block), charSet)
}
for ((block, alias) <- UnicodeDatabase.blockSynonyms.asScala) {
ret.put(UnicodeDatabaseReader.canonicalizeBlockName(alias), ret(UnicodeDatabaseReader.canonicalizeBlockName(block)))
}
ret.toMap
}

val unicodeScripts: Map[String, CharSet] = {
val ret = collection.mutable.Map[String, CharSet]()
for ((block, ranges) <- UnicodeDatabase.scriptRanges.asScala) {
val chatSet = CharSet(ranges.asScala.toSeq.map(range => CharRange(range.from, range.to)))
ret.put(block.toUpperCase, chatSet)
}
for ((script, alias) <- UnicodeDatabase.scriptSynomyms.asScala) {
ret.put(alias.toUpperCase, ret(script.toUpperCase))
}
ret.toMap
}

val lower = CharSet.fromRange(CharRange(from = 'a', to = 'z'))
val upper = CharSet.fromRange(CharRange(from = 'A', to = 'Z'))
val alpha = CharSet.fromCharSets(lower, upper)
val digit = CharSet.fromRange(CharRange(from = '0', to = '9'))
val alnum = CharSet.fromCharSets(alpha, digit)
val punct = CharSet("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""".map(char => Lit(char)))
val graph = CharSet.fromCharSets(alnum, punct)
val space = CharSet(Seq(Lit('\n'), Lit('\t'), Lit('\r'), Lit('\f'), Lit(' '), Lit(0x0B)))
val wordChar = CharSet(alnum.ranges :+ Lit('_'))

val posixClasses = Map(
"Lower" -> lower,
"Upper" -> upper,
"ASCII" -> CharSet.fromRange(CharRange(from = 0, to = 0x7F)),
"Alpha" -> alpha,
"Digit" -> digit,
"Alnum" -> alnum,
"Punct" -> punct,
"Graph" -> graph,
"Print" -> CharSet(graph.ranges :+ Lit(0x20)),
"Blank" -> CharSet(Seq(Lit(0x20), Lit('\t'))),
"Cntrl" -> CharSet(Seq(CharRange(from = 0, to = 0x1F), Lit(0x7F))),
"XDigit" -> CharSet(digit.ranges ++ Seq(CharRange(from = 'a', to = 'f'), CharRange(from = 'A', to = 'F'))),
"Space" -> space
)

// Unicode version of POSIX-defined character classes

val unicodeDigit = unicodeBinaryProperties("DIGIT")
Expand Down Expand Up @@ -101,7 +51,7 @@ object PredefinedCharSets {
val unicodePosixClasses = Map(
"Lower" -> unicodeBinaryProperties("LOWERCASE"),
"Upper" -> unicodeBinaryProperties("UPPERCASE"),
"ASCII" -> posixClasses("ASCII"),
"ASCII" -> PredefinedPosixCharSets.classes.get("ASCII"),
"Alpha" -> unicodeBinaryProperties("ALPHABETIC"),
"Digit" -> unicodeDigit,
"Alnum" -> CharSet.fromCharSets(unicodeBinaryProperties("ALPHABETIC"), unicodeDigit),
Expand Down
44 changes: 44 additions & 0 deletions src/main/scala/dregex/impl/PredefinedPosixCharSets.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
package dregex.impl;

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static dregex.impl.RegexTree.CharSet$;
import static dregex.impl.RegexTree.CharSet;
import static dregex.impl.RegexTree.Lit;
import static dregex.impl.RegexTree.CharRange;

public class PredefinedPosixCharSets {

public static CharSet lower = CharSet$.MODULE$.fromRange(new CharRange('a', 'z'));
public static CharSet upper = CharSet$.MODULE$.fromRange(new CharRange('A', 'Z'));
public static CharSet alpha = CharSet$.MODULE$.fromCharSetsJava(List.of(lower, upper));
public static CharSet digit = CharSet$.MODULE$.fromRange(new CharRange('0', '9'));
public static CharSet alnum = CharSet$.MODULE$.fromCharSetsJava(List.of(alpha, digit));
public static CharSet punct = CharSet$.MODULE$.fromJava(
"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~".chars().mapToObj(ch -> new Lit(ch)).collect(Collectors.toList()));
public static CharSet graph = CharSet$.MODULE$.fromCharSetsJava(List.of(alnum, punct));
public static CharSet space = CharSet$.MODULE$.fromJava(List.of(new Lit('\n'), new Lit('\t'),
new Lit('\r'), new Lit('\f'), new Lit(' '), new Lit(0x0B)));
public static CharSet wordChar = CharSet$.MODULE$.fromJava(
Stream.concat(alnum.javaRanges().stream(), Stream.of(new Lit('_'))).collect(Collectors.toList()));

public static Map<String, CharSet> classes = Map.ofEntries(
Map.entry("Lower", lower),
Map.entry("Upper", upper),
Map.entry("ASCII", CharSet$.MODULE$.fromRange(new CharRange(0, 0x7F))),
Map.entry("Alpha", alpha),
Map.entry("Digit", digit),
Map.entry("Alnum", alnum),
Map.entry("Punct", punct),
Map.entry("Graph", graph),
Map.entry("Print", CharSet$.MODULE$.fromJava(Stream.concat(graph.javaRanges().stream(),
Stream.of(new Lit(0x20))).collect(Collectors.toList()))),
Map.entry("Blank", CharSet$.MODULE$.fromJava(List.of(new Lit(0x20), new Lit('\t')))),
Map.entry("Cntrl", CharSet$.MODULE$.fromJava(List.of(new CharRange(0, 0x1F), new Lit(0x7F)))),
Map.entry("XDigit", CharSet$.MODULE$.fromJava(Stream.concat(digit.javaRanges().stream(),
Stream.of(new CharRange('a','f'), new CharRange('A', 'F'))).collect(Collectors.toList()))),
Map.entry("Space", space));
}
38 changes: 24 additions & 14 deletions src/main/scala/dregex/impl/RegexParser.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import dregex.{InvalidRegexException, ParsedRegex}
import dregex.impl.RegexParser.DotMatch

import scala.util.parsing.combinator.RegexParsers
import scala.jdk.CollectionConverters._

class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean) extends RegexParsers {

Expand Down Expand Up @@ -137,11 +138,17 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
case propName ~ _ ~ propValue =>
if (propName == "block" || propName == "blk") {
val canonicalBlockName = UnicodeDatabaseReader.canonicalizeBlockName(propValue)
PredefinedCharSets.unicodeBlocks
.getOrElse(canonicalBlockName, throw new InvalidRegexException("Invalid Unicode block: " + propValue))
val block = UnicodeBlocks.unicodeBlocks.get(canonicalBlockName)
if (block == null) {
throw new InvalidRegexException("Invalid Unicode block: " + propValue)
}
block
} else if (propName == "script" || propName == "sc") {
PredefinedCharSets.unicodeScripts
.getOrElse(propValue.toUpperCase(), throw new InvalidRegexException("Invalid Unicode script: " + propValue))
val script = UnicodeScripts.unicodeScripts.get(propValue.toUpperCase())
if (script == null) {
throw new InvalidRegexException("Invalid Unicode script: " + propValue)
}
script
} else if (propName == "general_category" || propName == "gc") {
PredefinedCharSets.unicodeGeneralCategories
.getOrElse(propValue, throw new InvalidRegexException("Invalid Unicode general category: " + propValue))
Expand All @@ -155,7 +162,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
* If the property starts with "Is" it could be either a script,
* general category or a binary property. Look for all.
*/
PredefinedCharSets.unicodeScripts
UnicodeScripts.unicodeScripts.asScala
.get(name.toUpperCase())
.orElse(PredefinedCharSets.unicodeGeneralCategories.get(name))
.orElse(PredefinedCharSets.unicodeBinaryProperties.get(name.toUpperCase()))
Expand All @@ -165,8 +172,11 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
}

def specialCharSetWithIn = backslash ~ "p" ~ "{" ~ "In" ~> unicodeSubsetName <~ "}" ^^ { blockName =>
PredefinedCharSets.unicodeBlocks
.getOrElse(UnicodeDatabaseReader.canonicalizeBlockName(blockName), throw new InvalidRegexException("Invalid Unicode block: " + blockName))
val block = UnicodeBlocks.unicodeBlocks.get(UnicodeDatabaseReader.canonicalizeBlockName(blockName))
if (block == null) {
throw new InvalidRegexException("Invalid Unicode block: " + blockName)
}
block
}

def specialCharSetWithJava = backslash ~ "p" ~ "{" ~ "java" ~> unicodeSubsetName <~ "}" ^^ { charClass =>
Expand All @@ -184,7 +194,7 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
if (unicodeClasses) {
PredefinedCharSets.unicodePosixClasses
} else {
PredefinedCharSets.posixClasses
PredefinedPosixCharSets.classes.asScala
}
}
effPosixClasses.get(name).orElse(PredefinedCharSets.unicodeGeneralCategories.get(name)).getOrElse {
Expand Down Expand Up @@ -228,42 +238,42 @@ class RegexParser(comments: Boolean, dotMatch: DotMatch, unicodeClasses: Boolean
if (unicodeClasses)
PredefinedCharSets.unicodeDigit
else
PredefinedCharSets.digit
PredefinedPosixCharSets.digit
}

def shorthandCharSetDigitCompl = backslash ~ "D" ^^^ {
if (unicodeClasses)
PredefinedCharSets.unicodeDigit.complement
else
PredefinedCharSets.digit.complement
PredefinedPosixCharSets.digit.complement
}

def shorthandCharSetSpace = backslash ~ "s" ^^^ {
if (unicodeClasses)
PredefinedCharSets.unicodeSpace
else
PredefinedCharSets.space
PredefinedPosixCharSets.space
}

def shorthandCharSetSpaceCompl = backslash ~ "S" ^^^ {
if (unicodeClasses)
PredefinedCharSets.unicodeSpace.complement
else
PredefinedCharSets.space.complement
PredefinedPosixCharSets.space.complement
}

def shorthandCharSetWord = backslash ~ "w" ^^^ {
if (unicodeClasses)
PredefinedCharSets.unicodeWordChar
else
PredefinedCharSets.wordChar
PredefinedPosixCharSets.wordChar
}

def shorthandCharSetWordCompl = backslash ~ "W" ^^^ {
if (unicodeClasses)
PredefinedCharSets.unicodeWordChar.complement
else
PredefinedCharSets.wordChar.complement
PredefinedPosixCharSets.wordChar.complement
}

def charClass = "[" ~> "^".? ~ "-".? ~ charClassAtom.+ ~ "-".? <~ "]" ^^ {
Expand Down
9 changes: 9 additions & 0 deletions src/main/scala/dregex/impl/RegexTree.scala
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package dregex.impl

import scala.runtime.ScalaRunTime
import scala.jdk.CollectionConverters._

sealed trait Direction
object Direction {
Expand Down Expand Up @@ -109,11 +110,19 @@ object RegexTree {
def canonical = this
def precedence = 1
override def toString = s"${getClass.getSimpleName}(${ranges.mkString(",")})"
def javaRanges() = ranges.asJava
}

object CharSet {

  /** Builds a set holding the ranges of every given set, concatenated in argument order. */
  def fromCharSets(charSets: CharSet*): CharSet = CharSet(charSets.to(Seq).flatMap(_.ranges))

  /** Variant of [[fromCharSets]] for Java callers, which cannot pass Scala varargs directly. */
  def fromCharSetsJava(charSets: java.util.List[CharSet]): CharSet = fromCharSets(charSets.asScala.toSeq: _*)

  /** Builds a set made of the single given range. */
  def fromRange(interval: AbstractRange): CharSet = CharSet(Seq(interval))

  /** Builds a set from a Java list of ranges; the list is copied into a Scala `Seq`. */
  def fromJava(ranges: java.util.List[AbstractRange]): CharSet = {
    CharSet(ranges.asScala.toSeq)
  }
}

case class Disj(values: Seq[Node]) extends ComplexPart {
Expand Down
42 changes: 42 additions & 0 deletions src/main/scala/dregex/impl/UnicodeBlocks.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package dregex.impl;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;

/**
 * Character sets for the Unicode blocks, keyed by canonicalized block name
 * (see {@link UnicodeDatabaseReader#canonicalizeBlockName}).
 *
 * <p>The block ranges are read once, at class-initialization time, from the
 * {@code /Blocks.txt} classpath resource. Any failure there aborts class initialization.
 */
public class UnicodeBlocks {

    private static final String BLOCKS_RESOURCE = "/Blocks.txt";

    private static final Map<String, UnicodeDatabaseReader.Range> ranges;

    static {
        try (var blocksFile = UnicodeBlocks.class.getResourceAsStream(BLOCKS_RESOURCE)) {
            // getResourceAsStream signals a missing resource by returning null; report that
            // explicitly instead of letting it surface as an unexplained NullPointerException.
            if (blocksFile == null) {
                throw new IllegalStateException("Classpath resource not found: " + BLOCKS_RESOURCE);
            }
            // Decode with a fixed charset so parsing does not depend on the platform default.
            // NOTE(review): assumes the bundled file is UTF-8 (or plain ASCII) -- confirm.
            ranges = UnicodeDatabaseReader.getBlocks(
                    new InputStreamReader(blocksFile, java.nio.charset.StandardCharsets.UTF_8));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Block name to alias; the alias is registered as an extra key for the same set. */
    private static final Map<String, String> synonyms = Map.of(
            "Greek and Coptic", "Greek"
    );

    /** Unmodifiable view: canonical block name (or alias) to its character set. */
    public static final Map<String, RegexTree.CharSet> unicodeBlocks;

    static {
        var blocks = new HashMap<String, RegexTree.CharSet>();
        for (var entry : ranges.entrySet()) {
            var range = entry.getValue();
            var charSet = RegexTree.CharSet$.MODULE$.fromRange(new RegexTree.CharRange(range.from, range.to));
            blocks.put(UnicodeDatabaseReader.canonicalizeBlockName(entry.getKey()), charSet);
        }
        for (var entry : synonyms.entrySet()) {
            var block = entry.getKey();
            var alias = entry.getValue();
            var target = blocks.get(UnicodeDatabaseReader.canonicalizeBlockName(block));
            // Fail fast rather than silently registering the alias with a null value.
            if (target == null) {
                throw new IllegalStateException(
                        "Block synonym '" + alias + "' refers to unknown block '" + block + "'");
            }
            blocks.put(UnicodeDatabaseReader.canonicalizeBlockName(alias), target);
        }
        // Publish a read-only view so callers cannot alter the shared table.
        unicodeBlocks = java.util.Collections.unmodifiableMap(blocks);
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,24 @@

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class UnicodeDatabase {
public class UnicodeScripts {

public static Map<String, UnicodeDatabaseReader.Range> blockRanges;
private static final Map<String, List<UnicodeDatabaseReader.Range>> ranges;

static {
try (var blocksFile = UnicodeDatabase.class.getResourceAsStream("/Blocks.txt")) {
blockRanges = UnicodeDatabaseReader.getBlocks(new InputStreamReader(blocksFile));
try (var scriptsFile = UnicodeScripts.class.getResourceAsStream("/Scripts.txt")) {
ranges = UnicodeDatabaseReader.getScripts(new InputStreamReader(scriptsFile));
} catch (IOException e) {
throw new RuntimeException(e);
}
}

public static Map<String, String> blockSynonyms = Map.of(
"Greek and Coptic", "Greek"
);

public static Map<String, List<UnicodeDatabaseReader.Range>> scriptRanges;

static {
try (var scriptsFile = UnicodeDatabase.class.getResourceAsStream("/Scripts.txt")) {
scriptRanges = UnicodeDatabaseReader.getScripts(new InputStreamReader(scriptsFile));
} catch (IOException e) {
throw new RuntimeException(e);
}
}

public static Map<String, String> scriptSynomyms = Map.ofEntries(
private static final Map<String, String> synomyms = Map.ofEntries(
Map.entry("COMMON", "ZYYY"),
Map.entry("LATIN", "LATN"),
Map.entry("GREEK", "GREK"),
Expand Down Expand Up @@ -189,4 +177,23 @@ public class UnicodeDatabase {
Map.entry("DIVES_AKURU", "DIAK"),
Map.entry("KHITAN_SMALL_SCRIPT", "KITS"));


public static final Map<String, RegexTree.CharSet> unicodeScripts;

static {
unicodeScripts = new HashMap<>();
for (var entry : ranges.entrySet()) {
var block = entry.getKey();
var ranges = entry.getValue();
var chatSet = RegexTree.CharSet$.MODULE$.fromJava(ranges.stream().map(
range -> new RegexTree.CharRange(range.from, range.to)).collect(Collectors.toList()));
unicodeScripts.put(block.toUpperCase(), chatSet);
}
for (var entry : synomyms.entrySet()) {
var script = entry.getKey();
var alias = entry.getValue();
unicodeScripts.put(alias.toUpperCase(), unicodeScripts.get(script.toUpperCase()));
}
}

}
4 changes: 2 additions & 2 deletions src/test/scala/dregex/TreeGenerator.scala
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package dregex

import dregex.impl.RegexTree
import dregex.impl.PredefinedCharSets
import dregex.impl.PredefinedCharSets2

/**
* Generate some sample regex trees, useful for testing.
Expand All @@ -17,7 +17,7 @@ class TreeGenerator {
Wildcard,
CharSet.fromRange(CharRange('d', 'f')),
CharSet.fromRange(CharRange('d', 'f')).complement,
PredefinedCharSets.digit)
PredefinedCharSets2.digit)
} else {
generateFixedDepth(levels - 1).flatMap { node =>
val simple = Iterator(
Expand Down
Loading

0 comments on commit 46e94d9

Please sign in to comment.