Skip to content

Commit

Permalink
Migrate to Java: atoms
Browse files Browse the repository at this point in the history
  • Loading branch information
marianobarrios committed Mar 19, 2023
1 parent 2853db1 commit 6ef55d4
Show file tree
Hide file tree
Showing 10 changed files with 186 additions and 119 deletions.
9 changes: 4 additions & 5 deletions src/main/scala/dregex/Universe.scala
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
package dregex

import dregex.impl.RegexTree
import dregex.impl.CharInterval
import dregex.impl.Normalization
import dregex.impl.{CharInterval, Normalization, RegexTree}
import scala.jdk.CollectionConverters._

/**
* The purpose of this class is to enforce that set operations between regular expressions are only done when it is
Expand All @@ -22,8 +21,8 @@ class Universe(parsedTrees: Seq[RegexTree.Node], val normalization: Normalizatio

import RegexTree._

private[dregex] val alphabet: Map[AbstractRange, Seq[CharInterval]] = {
CharInterval.calculateNonOverlapping(parsedTrees.flatMap(t => collect(t)))
private[dregex] val alphabet: java.util.Map[AbstractRange, java.util.List[CharInterval]] = {
CharInterval.calculateNonOverlapping(parsedTrees.flatMap(t => collect(t)).asJava)
}

/**
Expand Down
2 changes: 1 addition & 1 deletion src/main/scala/dregex/extra/DotFormatter.scala
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ object DotFormatter {
}
val transitions = for (transition <- nfa.transitions) yield {
val weight =
if (transition.char == Epsilon)
if (transition.char == new Epsilon())
1
else
2
Expand Down
7 changes: 7 additions & 0 deletions src/main/scala/dregex/impl/AtomPart.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package dregex.impl;

/**
 * Common type for the two kinds of atom visible in this package: a range of
 * characters ({@link CharInterval}) and the absence of a character
 * ({@link Epsilon}).
 *
 * <p>The interface declares no members; it exists only so that both kinds can
 * be passed and stored under a single type.
 */
public interface AtomPart {
}
84 changes: 84 additions & 0 deletions src/main/scala/dregex/impl/CharInterval.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package dregex.impl;

import scala.math.Ordered;

import java.util.*;

/**
 * An immutable, closed interval of Unicode characters: every character between
 * {@link #from} and {@link #to}, both inclusive. A single character is
 * represented by an interval whose bounds are equal.
 *
 * <p>Note that ordering and equality are intentionally different: {@link #compare}
 * looks only at the lower bound, while {@link #equals} requires both bounds to match.
 */
public final class CharInterval implements AtomPart, Ordered<CharInterval> {

    /** Lower bound (inclusive); never null. */
    public final UnicodeChar from;

    /** Upper bound (inclusive); never null and never smaller than {@link #from}. */
    public final UnicodeChar to;

    /**
     * Creates the interval {@code [from, to]}.
     *
     * @param from lower bound, inclusive
     * @param to upper bound, inclusive
     * @throws NullPointerException if either bound is null
     * @throws IllegalArgumentException if {@code from} is larger than {@code to}
     */
    public CharInterval(UnicodeChar from, UnicodeChar to) {
        Objects.requireNonNull(from, "from is null");
        Objects.requireNonNull(to, "to is null");
        if (from.compare(to) > 0) {
            throw new IllegalArgumentException("from value cannot be larger than to");
        }
        this.from = from;
        this.to = to;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        CharInterval that = (CharInterval) o;
        return Objects.equals(from, that.from) && Objects.equals(to, that.to);
    }

    @Override
    public int hashCode() {
        return Objects.hash(from, to);
    }

    /**
     * Orders intervals by their lower bound only; the upper bound is ignored.
     *
     * @param that the interval to compare with
     * @return the result of comparing {@code this.from} with {@code that.from}
     */
    @Override
    public int compare(CharInterval that) {
        return this.from.compare(that.from);
    }

    /**
     * Renders a single-character interval as that character and any other
     * interval as {@code [from-to]}.
     */
    @Override
    public String toString() {
        if (from.equals(to)) {
            return from.toString();
        } else {
            return String.format("[%s-%s]", from, to);
        }
    }

    /**
     * Cuts the given (possibly overlapping) ranges at every boundary of every range and
     * returns, for each input range, the intervals between consecutive cut points that
     * fall inside it, in ascending order.
     *
     * @param ranges the ranges to split; may be empty, in which case an empty map is returned
     * @return a map from each input range to the list of intervals derived for it
     */
    public static Map<RegexTree.AbstractRange, List<CharInterval>> calculateNonOverlapping(List<RegexTree.AbstractRange> ranges) {
        // Collect every point at which an interval may start or end. Each range boundary
        // also closes the interval just before it and opens the one just after it.
        Set<UnicodeChar> startSet = new HashSet<>();
        Set<UnicodeChar> endSet = new HashSet<>();
        for (var range : ranges) {
            startSet.add(range.from());
            if (range.from().compare(UnicodeChar.min()) > 0) {
                endSet.add(range.from().$minus(1));
            }
            endSet.add(range.to());
            if (range.to().compare(UnicodeChar.max()) < 0) {
                startSet.add(range.to().$plus(1));
            }
        }

        // Sort the boundaries once; the per-range sub-sets below are read-only views,
        // so the sorted sets can be shared by all ranges instead of being rebuilt each time.
        NavigableSet<UnicodeChar> sortedStarts = new TreeSet<>(startSet);
        NavigableSet<UnicodeChar> sortedEnds = new TreeSet<>(endSet);

        Map<RegexTree.AbstractRange, List<CharInterval>> ret = new HashMap<>();
        for (var range : ranges) {
            var startSubSet = sortedStarts.subSet(range.from(), true, range.to(), true);
            var endSubSet = sortedEnds.subSet(range.from(), true, range.to(), true);
            assert startSubSet.size() == endSubSet.size();
            List<CharInterval> res = new ArrayList<>(startSubSet.size());
            // Pair the i-th smallest start with the i-th smallest end.
            Iterator<UnicodeChar> ends = endSubSet.iterator();
            for (UnicodeChar start : startSubSet) {
                res.add(new CharInterval(start, ends.next()));
            }
            ret.put(range, res);
        }
        return ret;
    }

}
42 changes: 22 additions & 20 deletions src/main/scala/dregex/impl/Compiler.scala
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@ package dregex.impl

import dregex.InvalidRegexException

import java.util.stream.Collectors
import scala.collection.mutable.Buffer
import scala.jdk.CollectionConverters._

/**
* Take a regex AST and produce a NFA.
* Except where noted, the Thompson-McNaughton-Yamada algorithm is used.
* Reference: http://stackoverflow.com/questions/11819185/steps-to-creating-an-nfa-from-a-regular-expression
*/
class Compiler(intervalMapping: Map[RegexTree.AbstractRange, Seq[CharInterval]]) {
class Compiler(intervalMapping: java.util.Map[RegexTree.AbstractRange, java.util.List[CharInterval]]) {

import RegexTree._

Expand All @@ -30,8 +32,8 @@ class Compiler(intervalMapping: Map[RegexTree.AbstractRange, Seq[CharInterval]])
// base case

case range: AbstractRange =>
val intervals = intervalMapping(range)
intervals.map(interval => NfaTransition(from, to, interval))
val intervals = intervalMapping.get(range)
intervals.stream().map(interval => NfaTransition(from, to, interval)).collect(Collectors.toList()).asScala.toSeq

// recurse

Expand Down Expand Up @@ -141,7 +143,7 @@ class Compiler(intervalMapping: Map[RegexTree.AbstractRange, Seq[CharInterval]])
private def processJuxtNoLookaround(juxt: Juxt, from: SimpleState, to: SimpleState): Seq[NfaTransition] = {
juxt match {
case Juxt(Seq()) =>
Seq(NfaTransition(from, to, Epsilon))
Seq(NfaTransition(from, to, new Epsilon()))

case Juxt(Seq(head)) =>
fromTreeImpl(head, from, to)
Expand Down Expand Up @@ -178,7 +180,7 @@ class Compiler(intervalMapping: Map[RegexTree.AbstractRange, Seq[CharInterval]])
fromTreeImpl(value, from, to)

case Rep(0, Some(0), value) =>
Seq(NfaTransition(from, to, Epsilon))
Seq(NfaTransition(from, to, new Epsilon()))

// infinite repetitions

Expand All @@ -190,18 +192,18 @@ class Compiler(intervalMapping: Map[RegexTree.AbstractRange, Seq[CharInterval]])
val int1 = new SimpleState
val int2 = new SimpleState
fromTreeImpl(value, int1, int2) :+
NfaTransition(from, int1, Epsilon) :+
NfaTransition(int2, to, Epsilon) :+
NfaTransition(int2, int1, Epsilon)
NfaTransition(from, int1, new Epsilon()) :+
NfaTransition(int2, to, new Epsilon()) :+
NfaTransition(int2, int1, new Epsilon())

case Rep(0, None, value) =>
val int1 = new SimpleState
val int2 = new SimpleState
fromTreeImpl(value, int1, int2) :+
NfaTransition(from, int1, Epsilon) :+
NfaTransition(int2, to, Epsilon) :+
NfaTransition(from, to, Epsilon) :+
NfaTransition(int2, int1, Epsilon)
NfaTransition(from, int1, new Epsilon()) :+
NfaTransition(int2, to, new Epsilon()) :+
NfaTransition(from, to, new Epsilon()) :+
NfaTransition(int2, int1, new Epsilon())

// finite repetitions

Expand All @@ -219,11 +221,11 @@ class Compiler(intervalMapping: Map[RegexTree.AbstractRange, Seq[CharInterval]])
for (i <- 1 until m - 1) {
val int = new SimpleState
transitions ++= fromTreeImpl(value, prev, int)
transitions += NfaTransition(prev, to, Epsilon)
transitions += NfaTransition(prev, to, new Epsilon())
prev = int
}
transitions ++= fromTreeImpl(value, prev, to)
transitions += NfaTransition(prev, to, Epsilon)
transitions += NfaTransition(prev, to, new Epsilon())
transitions.to(Seq)

case Rep(0, Some(m), value) if m > 0 =>
Expand All @@ -233,11 +235,11 @@ class Compiler(intervalMapping: Map[RegexTree.AbstractRange, Seq[CharInterval]])
for (i <- 0 until m - 1) {
val int = new SimpleState
transitions ++= fromTreeImpl(value, prev, int)
transitions += NfaTransition(prev, to, Epsilon)
transitions += NfaTransition(prev, to, new Epsilon())
prev = int
}
transitions ++= fromTreeImpl(value, prev, to)
transitions += NfaTransition(prev, to, Epsilon)
transitions += NfaTransition(prev, to, new Epsilon())
transitions.to(Seq)

}
Expand All @@ -254,16 +256,16 @@ class Compiler(intervalMapping: Map[RegexTree.AbstractRange, Seq[CharInterval]])
val result =
DfaAlgorithms.toNfa(operation(leftDfa, rightDfa))
result.transitions ++
result.accepting.to(Seq).map(acc => NfaTransition(acc, to, Epsilon)) :+
NfaTransition(from, result.initial, Epsilon)
result.accepting.to(Seq).map(acc => NfaTransition(acc, to, new Epsilon())) :+
NfaTransition(from, result.initial, new Epsilon())
}

def processCaptureGroup(value: Node, from: SimpleState, to: SimpleState): Seq[NfaTransition] = {
val int1 = new SimpleState
val int2 = new SimpleState
fromTreeImpl(value, int1, int2) :+
NfaTransition(from, int1, Epsilon) :+
NfaTransition(int2, to, Epsilon)
NfaTransition(from, int1, new Epsilon()) :+
NfaTransition(int2, to, new Epsilon())
}

}
6 changes: 3 additions & 3 deletions src/main/scala/dregex/impl/DfaAlgorithms.scala
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ object DfaAlgorithms {
@tailrec
def followEpsilonImpl(current: Set[State]): MultiState = {
val immediate = for (state <- current) yield {
transitionMap.getOrElse(state, Map()).getOrElse(Epsilon, Set())
transitionMap.getOrElse(state, Map()).getOrElse(new Epsilon(), Set())
}
val expanded = immediate.fold(current)(_ union _)
if (expanded == current)
Expand Down Expand Up @@ -229,7 +229,7 @@ object DfaAlgorithms {

def reverse[A <: State](dfa: Dfa[A]): Nfa = {
val initial: State = new SimpleState
val first = dfa.accepting.to(Seq).map(s => NfaTransition(initial, s, Epsilon))
val first = dfa.accepting.to(Seq).map(s => NfaTransition(initial, s, new Epsilon()))
val rest = for {
(from, fn) <- dfa.defTransitions
(char, to) <- fn
Expand Down Expand Up @@ -282,7 +282,7 @@ object DfaAlgorithms {
val char = UnicodeChar(codePoint)
val currentTrans = dfa.defTransitions.getOrElse(current, SortedMap[CharInterval, A]())
// O(log transitions) search in the range tree
val newState = Util.floorEntry(currentTrans, CharInterval(from = char, to = char)).flatMap {
val newState = Util.floorEntry(currentTrans, new CharInterval(char, char)).flatMap {
case (interval, state) =>
if (interval.to >= char) {
Some(state)
Expand Down
18 changes: 18 additions & 0 deletions src/main/scala/dregex/impl/Epsilon.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
package dregex.impl;

/**
 * The atom that stands for "no character". The class carries no state, so every
 * instance is interchangeable with every other: all instances are equal and share
 * the same hash code.
 */
public final class Epsilon implements AtomPart {

    @Override
    public String toString() {
        return "ε";
    }

    /**
     * Returns {@code true} exactly when {@code other} is an {@code Epsilon}.
     *
     * @param other the object to compare with; may be null
     * @return whether {@code other} is a non-null {@code Epsilon}
     */
    @Override
    public boolean equals(Object other) {
        // instanceof is false for null, so equals(null) yields false instead of
        // throwing. The class is final, so this matches an exact-class comparison.
        return other instanceof Epsilon;
    }

    @Override
    public int hashCode() {
        return getClass().hashCode();
    }
}
65 changes: 0 additions & 65 deletions src/main/scala/dregex/impl/atoms.scala

This file was deleted.

Loading

0 comments on commit 6ef55d4

Please sign in to comment.