diff --git a/app/github/release/baselineProfiles/0/app-github-release.dm b/app/github/release/baselineProfiles/0/app-github-release.dm new file mode 100644 index 00000000..ed380121 Binary files /dev/null and b/app/github/release/baselineProfiles/0/app-github-release.dm differ diff --git a/app/github/release/baselineProfiles/1/app-github-release.dm b/app/github/release/baselineProfiles/1/app-github-release.dm new file mode 100644 index 00000000..fcee67dd Binary files /dev/null and b/app/github/release/baselineProfiles/1/app-github-release.dm differ diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index deb93c5a..cb6a70c4 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -1,28 +1,27 @@ [versions] -ahocorasickVersion="0.6.3" androidGifDrawableVersion="1.2.23" androidsvgAarVersion="1.4" -appcompatVersion="1.6.1" +appcompatVersion="1.7.0" bcprovJdk15onVersion="1.70" coil="3.0.0-alpha06" coreKtxVersion="1.13.1" exoplayerVersion="2.19.1" -firebaseCrashlyticsGradleVersion="2.9.9" -fragmentKtxVersion="1.7.0" +firebaseCrashlyticsGradleVersion="3.0.1" +fragmentKtxVersion="1.8.0" glideVersion="4.16.0" -googleServicesVersion="4.4.1" +googleServicesVersion="4.4.2" gradle= "8.3.2" accompanistDrawablepainterVersion = "0.34.0" -cameraCoreVersion = "1.4.0-alpha04" +cameraCoreVersion = "1.4.0-beta02" apollo = "3.2.1" -compose = "1.6.7" +compose = "1.6.8" activityComposeVersion = "1.9.0" -composeBom = "2024.05.00" +composeBom = "2024.06.00" coreSplashscreenVersion = "1.0.1" coreVersion = "3.5.3" datastorePreferencesVersion = "1.1.1" -firebaseCrashlyticsKtxVersion = "18.6.3" -firebaseBomVersion = "32.8.0" +firebaseCrashlyticsKtxVersion = "19.0.1" +firebaseBomVersion = "33.1.0" gsonVersion="2.10.1" guavaVersion="33.0.0-jre" jsoupVersion="1.15.3" @@ -34,12 +33,12 @@ kotlinxCoroutinesVersion ="1.8.1" ktor = "3.0.0-beta-1" leakcanaryAndroidVersion = "2.12" lifecycleExtensionsVersion="2.2.0" -lifecycleViewmodelKtxVersion="2.8.0" 
+lifecycleViewmodelKtxVersion="2.8.2" markwon="4.6.2" materialVersion="1.12.0" media3 = "1.3.1" navigationComposeVersion = "2.7.7" -materialIconsExtendedVersion = "1.7.0-alpha05" +materialIconsExtendedVersion = "1.7.0-beta03" material3Version = "1.2.1" okhttpVersion="4.12.0" openaiClientVersion = "3.6.2" @@ -50,14 +49,13 @@ room = "2.6.1" snakeYamlVersion = "v1.18-android" subsamplingScaleImageViewAndroidxVersion="3.10.0" transitionVersion="1.5.0" -viewpager2Version = "1.0.0" +viewpager2Version = "1.1.0" workRuntimeKtxVersion = "2.9.0" ztZipVersion = "1.16" devtoolsKspVersion = "1.9.23-1.0.20" pagingVersion = "3.3.0" [libraries] -ahocorasick = { module = "org.ahocorasick:ahocorasick", version.ref = "ahocorasickVersion" } android-gif-drawable = { module = "pl.droidsonroids.gif:android-gif-drawable", version.ref = "androidGifDrawableVersion" } androidsvg-aar = { module = "com.caverock:androidsvg-aar", version.ref = "androidsvgAarVersion" } androidx-appcompat = { module = "androidx.appcompat:appcompat", version.ref = "appcompatVersion" } diff --git a/lib/build.gradle.kts b/lib/build.gradle.kts index 001f726e..d387f4eb 100644 --- a/lib/build.gradle.kts +++ b/lib/build.gradle.kts @@ -79,7 +79,6 @@ dependencies { // https://github.com/davemorrissey/subsampling-scale-image-view api(libs.subsampling.scale.image.view.androidx) - implementation(libs.ahocorasick) // For pinyin implementation(libs.bcprov.jdk15on) implementation(libs.bcpkix.jdk15on) api(libs.ktor.client.core) diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/Interval.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/Interval.java new file mode 100644 index 00000000..c1c7d97d --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/Interval.java @@ -0,0 +1,105 @@ +package com.ismartcoding.lib.ahocorasick.interval; + + +/** + * Responsible for tracking the start and end bounds, which are reused by + * both {@link Emit} and {@link PayloadEmit}. 
+ */ +public class Interval implements Intervalable { + + private final int start; + private final int end; + + /** + * Constructs an interval with a start and end position. + * + * @param start The interval's starting text position. + * @param end The interval's ending text position. + */ + public Interval(final int start, final int end) { + this.start = start; + this.end = end; + } + + /** + * Returns the starting offset into the text for this interval. + * + * @return A number between 0 (start of text) and the text length. + */ + @Override + public int getStart() { + return this.start; + } + + /** + * Returns the ending offset into the text for this interval. + * + * @return A number between getStart() + 1 and the text length. + */ + @Override + public int getEnd() { + return this.end; + } + + /** + * Returns the length of the interval. + * + * @return The end position less the start position, plus one. + */ + @Override + public int size() { + return end - start + 1; + } + + /** + * Answers whether the given interval overlaps this interval + * instance. + * + * @param other the other interval to check for overlap + * @return true The intervals overlap. + */ + public boolean overlapsWith(final Interval other) { + return this.start <= other.getEnd() && + this.end >= other.getStart(); + } + + public boolean overlapsWith(int point) { + return this.start <= point && point <= this.end; + } + + @Override + public boolean equals(Object o) { + if (!(o instanceof Intervalable)) { + return false; + } + Intervalable other = (Intervalable) o; + return this.start == other.getStart() && + this.end == other.getEnd(); + } + + @Override + public int hashCode() { + return this.start % 100 + this.end % 100; + } + + @Override + public int compareTo(Object o) { + if (!(o instanceof Intervalable)) { + return -1; + } + Intervalable other = (Intervalable) o; + int comparison = this.start - other.getStart(); + return comparison != 0 ? 
comparison : this.end - other.getEnd(); + } + + /** + * Returns the starting offset and ending offset separated + * by a full colon (:). + * + * @return A non-null String, never empty. + */ + @Override + public String toString() { + return this.start + ":" + this.end; + } +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/IntervalNode.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/IntervalNode.java new file mode 100644 index 00000000..1dbec481 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/IntervalNode.java @@ -0,0 +1,123 @@ +package com.ismartcoding.lib.ahocorasick.interval; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +public class IntervalNode { + + private enum Direction {LEFT, RIGHT} + + private IntervalNode left; + private IntervalNode right; + private int point; + private List intervals = new ArrayList<>(); + + public IntervalNode(final List intervals) { + this.point = determineMedian(intervals); + + final List toLeft = new ArrayList<>(); + final List toRight = new ArrayList<>(); + + for (Intervalable interval : intervals) { + if (interval.getEnd() < this.point) { + toLeft.add(interval); + } else if (interval.getStart() > this.point) { + toRight.add(interval); + } else { + this.intervals.add(interval); + } + } + + if (toLeft.size() > 0) { + this.left = new IntervalNode(toLeft); + } + if (toRight.size() > 0) { + this.right = new IntervalNode(toRight); + } + } + + public int determineMedian(final List intervals) { + int start = -1; + int end = -1; + for (Intervalable interval : intervals) { + int currentStart = interval.getStart(); + int currentEnd = interval.getEnd(); + if (start == -1 || currentStart < start) { + start = currentStart; + } + if (end == -1 || currentEnd > end) { + end = currentEnd; + } + } + return (start + end) / 2; + } + + public List findOverlaps(final Intervalable interval) { + final List overlaps = new ArrayList<>(); + 
+ if (this.point < interval.getStart()) { + // Tends to the right + addToOverlaps(interval, overlaps, findOverlappingRanges(this.right, interval)); + addToOverlaps(interval, overlaps, checkForOverlapsToTheRight(interval)); + } else if (this.point > interval.getEnd()) { + // Tends to the left + addToOverlaps(interval, overlaps, findOverlappingRanges(this.left, interval)); + addToOverlaps(interval, overlaps, checkForOverlapsToTheLeft(interval)); + } else { + // Somewhere in the middle + addToOverlaps(interval, overlaps, this.intervals); + addToOverlaps(interval, overlaps, findOverlappingRanges(this.left, interval)); + addToOverlaps(interval, overlaps, findOverlappingRanges(this.right, interval)); + } + + return overlaps; + } + + protected void addToOverlaps( + final Intervalable interval, + final List overlaps, + final List newOverlaps) { + for (final Intervalable currentInterval : newOverlaps) { + if (!currentInterval.equals(interval)) { + overlaps.add(currentInterval); + } + } + } + + protected List checkForOverlapsToTheLeft(final Intervalable interval) { + return checkForOverlaps(interval, Direction.LEFT); + } + + protected List checkForOverlapsToTheRight(final Intervalable interval) { + return checkForOverlaps(interval, Direction.RIGHT); + } + + protected List checkForOverlaps( + final Intervalable interval, final Direction direction) { + final List overlaps = new ArrayList<>(); + + for (final Intervalable currentInterval : this.intervals) { + switch (direction) { + case LEFT: + if (currentInterval.getStart() <= interval.getEnd()) { + overlaps.add(currentInterval); + } + break; + case RIGHT: + if (currentInterval.getEnd() >= interval.getStart()) { + overlaps.add(currentInterval); + } + break; + } + } + + return overlaps; + } + + protected List findOverlappingRanges(IntervalNode node, Intervalable interval) { + return node == null + ? 
Collections.emptyList() + : node.findOverlaps(interval); + } +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/IntervalTree.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/IntervalTree.java new file mode 100644 index 00000000..f8a5ee23 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/IntervalTree.java @@ -0,0 +1,49 @@ +package com.ismartcoding.lib.ahocorasick.interval; + +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import static java.util.Collections.sort; + +public class IntervalTree { + + private final IntervalNode rootNode; + + public IntervalTree(List intervals) { + this.rootNode = new IntervalNode(intervals); + } + + public List removeOverlaps(final List intervals) { + + // Sort the intervals on size, then left-most position + sort(intervals, new IntervalableComparatorBySize()); + + final Set removeIntervals = new TreeSet<>(); + + for (final Intervalable interval : intervals) { + // If the interval was already removed, ignore it + if (removeIntervals.contains(interval)) { + continue; + } + + // Remove all overallping intervals + removeIntervals.addAll(findOverlaps(interval)); + } + + // Remove all intervals that were overlapping + for (final Intervalable removeInterval : removeIntervals) { + intervals.remove(removeInterval); + } + + // Sort the intervals, now on left-most position only + sort(intervals, new IntervalableComparatorByPosition()); + + return intervals; + } + + public List findOverlaps(final Intervalable interval) { + return rootNode.findOverlaps(interval); + } + +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/Intervalable.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/Intervalable.java new file mode 100644 index 00000000..6e9afbb9 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/Intervalable.java @@ -0,0 +1,11 @@ +package com.ismartcoding.lib.ahocorasick.interval; 
+ +public interface Intervalable extends Comparable { + + int getStart(); + + int getEnd(); + + int size(); + +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/IntervalableComparatorByPosition.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/IntervalableComparatorByPosition.java new file mode 100644 index 00000000..a806ac4c --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/IntervalableComparatorByPosition.java @@ -0,0 +1,12 @@ +package com.ismartcoding.lib.ahocorasick.interval; + +import java.util.Comparator; + +public class IntervalableComparatorByPosition implements Comparator { + + @Override + public int compare(final Intervalable intervalable, final Intervalable intervalable2) { + return intervalable.getStart() - intervalable2.getStart(); + } + +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/IntervalableComparatorBySize.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/IntervalableComparatorBySize.java new file mode 100644 index 00000000..88fae996 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/interval/IntervalableComparatorBySize.java @@ -0,0 +1,18 @@ +package com.ismartcoding.lib.ahocorasick.interval; + +import java.util.Comparator; + +public class IntervalableComparatorBySize implements Comparator { + + @Override + public int compare(final Intervalable intervalable, final Intervalable intervalable2) { + int comparison = intervalable2.size() - intervalable.size(); + + if (comparison == 0) { + comparison = intervalable.getStart() - intervalable2.getStart(); + } + + return comparison; + } + +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/DefaultToken.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/DefaultToken.java new file mode 100644 index 00000000..909098ec --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/DefaultToken.java @@ -0,0 +1,21 @@ +package 
com.ismartcoding.lib.ahocorasick.trie; + +public class DefaultToken extends Token { + + private PayloadToken payloadToken; + + public DefaultToken(PayloadToken payloadToken) { + super(payloadToken.getFragment()); + this.payloadToken = payloadToken; + } + + public boolean isMatch() { + return payloadToken.isMatch(); + } + + public Emit getEmit() { + PayloadEmit emit = payloadToken.getEmit(); + return new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()); + } + +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/Emit.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/Emit.java new file mode 100644 index 00000000..41cdf4d8 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/Emit.java @@ -0,0 +1,27 @@ +package com.ismartcoding.lib.ahocorasick.trie; + + +import com.ismartcoding.lib.ahocorasick.interval.Interval; +import com.ismartcoding.lib.ahocorasick.interval.Intervalable; + +/** + * Responsible for tracking the bounds of matched terms. 
+ */ +public class Emit extends Interval implements Intervalable { + private final String keyword; + + public Emit(final int start, final int end, final String keyword) { + super(start, end); + this.keyword = keyword; + } + + public String getKeyword() { + return this.keyword; + } + + @Override + public String toString() { + return super.toString() + "=" + this.keyword; + } + +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/FragmentToken.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/FragmentToken.java new file mode 100644 index 00000000..ffb6cf09 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/FragmentToken.java @@ -0,0 +1,19 @@ +package com.ismartcoding.lib.ahocorasick.trie; + +public class FragmentToken extends Token { + + public FragmentToken(String fragment) { + super(fragment); + } + + @Override + public boolean isMatch() { + return false; + } + + @Override + public Emit getEmit() { + return null; + } + +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/MatchToken.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/MatchToken.java new file mode 100644 index 00000000..fa40a4ef --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/MatchToken.java @@ -0,0 +1,22 @@ +package com.ismartcoding.lib.ahocorasick.trie; + +public class MatchToken extends Token { + + private final Emit emit; + + public MatchToken(final String fragment, final Emit emit) { + super(fragment); + this.emit = emit; + + } + + @Override + public boolean isMatch() { + return true; + } + + @Override + public Emit getEmit() { + return this.emit; + } +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/Payload.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/Payload.java new file mode 100644 index 00000000..a627e0ca --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/Payload.java @@ -0,0 +1,32 @@ +package 
/**
 * Contains the matched keyword and some payload data.
 * <p>
 * Payloads are ordered by keyword only; the payload data does not take part
 * in {@link #compareTo(Payload)}.
 *
 * @author Daniel Beck
 * @param <T> The type of the wrapped payload data.
 */
public class Payload<T> implements Comparable<Payload<T>> {

    private final String keyword;
    private final T data;

    /**
     * @param keyword the keyword this payload belongs to.
     * @param data    the payload data; may be {@code null}.
     */
    public Payload(final String keyword, final T data) {
        this.keyword = keyword;
        this.data = data;
    }

    /**
     * @return the keyword passed to the constructor.
     */
    public String getKeyword() {
        return keyword;
    }

    /**
     * @return the payload data passed to the constructor, possibly {@code null}.
     */
    public T getData() {
        return data;
    }

    /**
     * Compares by keyword using {@link String#compareTo(String)}.
     *
     * @param other the payload to compare against.
     * @return the result of comparing the two keywords.
     */
    @Override
    public int compareTo(Payload<T> other) {
        return keyword.compareTo(other.getKeyword());
    }
}
+ * + * @return the associated payload + */ + public T getPayload() { + return this.payload; + } + + @Override + public String toString() { + return super.toString() + "=" + this.keyword + (this.payload != null ? "->" + this.payload : ""); + } +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadFragmentToken.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadFragmentToken.java new file mode 100644 index 00000000..35d1938e --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadFragmentToken.java @@ -0,0 +1,32 @@ +package com.ismartcoding.lib.ahocorasick.trie; + +/*** + * Container for a token ("the fragment") that can emit a type of payload. + *

+ * This token indicates a matching search term was not found, so + * {@link #isMatch()} always returns {@code false}. + *

+ * + * @author Daniel Beck + * + * @param The Type of the emitted payloads. + */ +public class PayloadFragmentToken extends PayloadToken { + + public PayloadFragmentToken(String fragment) { + super(fragment); + } + + @Override + public boolean isMatch() { + return false; + } + + /** + * Returns null. + */ + @Override + public PayloadEmit getEmit() { + return null; + } +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadMatchToken.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadMatchToken.java new file mode 100644 index 00000000..1b19cefd --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadMatchToken.java @@ -0,0 +1,32 @@ +package com.ismartcoding.lib.ahocorasick.trie; + +/** + * Container for a token ("the fragment") that can emit a type of payload. + *

+ * This token indicates a matching search term was found, so {@link #isMatch()} + * always returns {@code true}. + *

+ * + * @author Daniel Beck + * + * @param The Type of the emitted payloads. + */ +public class PayloadMatchToken extends PayloadToken { + + private final PayloadEmit emit; + + public PayloadMatchToken(final String fragment, final PayloadEmit emit) { + super(fragment); + this.emit = emit; + } + + @Override + public boolean isMatch() { + return true; + } + + @Override + public PayloadEmit getEmit() { + return this.emit; + } +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadState.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadState.java new file mode 100644 index 00000000..cbb4d6bb --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadState.java @@ -0,0 +1,144 @@ +package com.ismartcoding.lib.ahocorasick.trie; + +import java.util.*; + +/** + *

+ * A state has various important tasks it must attend to: + *

+ *
    + *
  • success; when a character points to another state, it must return that + * state
  • + *
  • failure; when a character has no matching state, the algorithm must be + * able to fall back on a state with less depth
  • + *
  • emits; when this state is passed and keywords have been matched, the + * matches and their payloads must be 'emitted' so that they can be used later + * on.
  • + *
+ *

+ * The root state is special in the sense that it has no failure state; it + * cannot fail. If it 'fails' it will still parse the next character and start + * from the root node. This ensures that the algorithm always runs. All other + * states always have a fail state. + *

+ * + * @author Daniel Beck + */ +public class PayloadState { + + /** + * effective the size of the keyword + */ + private final int depth; + + /** + * only used for the root state to refer to itself in case no matches have been + * found + */ + private final PayloadState rootState; + + /** + * referred to in the white paper as the 'goto' structure. From a state it is + * possible to go to other states, depending on the character passed. + */ + private final Map> success = new HashMap<>(); + + /** + * if no matching states are found, the failure state will be returned + */ + private PayloadState failure; + + /** + * whenever this state is reached, it will emit the matches keywords for future + * reference + */ + private Set> emits; + + public PayloadState() { + this(0); + } + + public PayloadState(final int depth) { + this.depth = depth; + this.rootState = depth == 0 ? this : null; + } + + private PayloadState nextState(final Character character, final boolean ignoreRootState) { + PayloadState nextState = this.success.get(character); + + if (!ignoreRootState && nextState == null && this.rootState != null) { + nextState = this.rootState; + } + + return nextState; + } + + public PayloadState nextState(final Character character) { + return nextState(character, false); + } + + public PayloadState nextStateIgnoreRootState(Character character) { + return nextState(character, true); + } + + public PayloadState addState(Character character) { + PayloadState nextState = nextStateIgnoreRootState(character); + if (nextState == null) { + nextState = new PayloadState<>(this.depth + 1); + this.success.put(character, nextState); + } + return nextState; + } + + public int getDepth() { + return this.depth; + } + + /** + * Adds a payload to be emitted for this state. + * + * @param payload to be emitted. 
+ */ + public void addEmit(Payload payload) { + if (this.emits == null) { + this.emits = new TreeSet<>(); + } + this.emits.add(payload); + } + + /** + * Adds a collection of payloads to be emitted for this state. + * + * @param emits Collection of payloads to be emitted. + */ + public void addEmit(Collection> emits) { + for (Payload emit : emits) { + addEmit(emit); + } + } + + /** + * Returns a collection of emitted payloads for this state. + * + * @return Collection of emitted payloads. + */ + public Collection> emit() { + return this.emits == null ? Collections.>emptyList() : this.emits; + } + + public PayloadState failure() { + return this.failure; + } + + public void setFailure(PayloadState failState) { + this.failure = failState; + } + + public Collection> getStates() { + return this.success.values(); + } + + public Collection getTransitions() { + return this.success.keySet(); + } +} \ No newline at end of file diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadToken.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadToken.java new file mode 100644 index 00000000..d5aee632 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadToken.java @@ -0,0 +1,32 @@ +package com.ismartcoding.lib.ahocorasick.trie; + +/*** + * PayloadToken holds a text ("the fragment") an emits some output. If + * {@link #isMatch()} returns {@code true}, the token matched a search term. + * + * @author Daniel Beck + * + * @param The Type of the emitted payloads. + */ +public abstract class PayloadToken { + private String fragment; + + public PayloadToken(String fragment) { + this.fragment = fragment; + } + + public String getFragment() { + return this.fragment; + } + + /** + * Return {@code true} if a search term matched. 
+ * @return {@code true} if this is a match + */ + public abstract boolean isMatch(); + + /** + * @return the payload + */ + public abstract PayloadEmit getEmit(); +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadTrie.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadTrie.java new file mode 100644 index 00000000..29b962bf --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/PayloadTrie.java @@ -0,0 +1,463 @@ +package com.ismartcoding.lib.ahocorasick.trie; + +import static java.lang.Character.isWhitespace; + +import java.util.LinkedList; +import java.util.Collection; +import java.util.List; +import java.util.Queue; +import java.util.concurrent.LinkedBlockingDeque; + +import com.ismartcoding.lib.ahocorasick.interval.IntervalTree; +import com.ismartcoding.lib.ahocorasick.interval.Intervalable; +import com.ismartcoding.lib.ahocorasick.trie.handler.DefaultPayloadEmitHandler; +import com.ismartcoding.lib.ahocorasick.trie.handler.PayloadEmitHandler; +import com.ismartcoding.lib.ahocorasick.trie.handler.StatefulPayloadEmitHandler; + +/** + * A trie implementation that carries a payload. See {@link Trie} for + * details on usage. + * + *

+ * The payload trie adds the possibility to specify emitted payloads for each + * added keyword. + *

+ * + * @author Daniel Beck + * @param The type of the supplied of the payload. + */ +public class PayloadTrie { + + private final TrieConfig trieConfig; + + private final PayloadState rootState; + + protected PayloadTrie(final TrieConfig trieConfig) { + this.trieConfig = trieConfig; + this.rootState = new PayloadState<>(); + } + + /** + * Used by the builder to add a text search keyword with an emit payload. + * + * @param keyword The search term to add to the list of search terms. + * @param emit the payload to emit for this search term. + * @throws NullPointerException if the keyword is null. + */ + private void addKeyword(String keyword, T emit) { + if (keyword.isEmpty()) { + return; + } + + addState(keyword).addEmit(new Payload<>(keyword, emit)); + } + + /** + * Used by the builder to add a text search keyword. + * + * @param keyword The search term to add to the list of search terms. + * @throws NullPointerException if the keyword is null. + */ + private void addKeyword(String keyword) { + if (keyword.isEmpty()) { + return; + } + + addState(keyword).addEmit(new Payload<>(keyword, null)); + } + + private PayloadState addState(final String keyword) { + PayloadState state = getRootState(); + for (final Character character : keyword.toCharArray()) { + Character adjustedChar = isCaseInsensitive() ? Character.toLowerCase(character) : character; + state = state.addState(adjustedChar); + } + return state; + } + + /** + * Tokenizes the specified text and returns the emitted outputs. + * + * @param text The text to tokenize. 
+ * @return the emitted outputs + */ + public Collection> tokenize(final String text) { + final Collection> tokens = new LinkedList<>(); + final Collection> collectedEmits = parseText(text); + int lastCollectedPosition = -1; + + for (final PayloadEmit emit : collectedEmits) { + if (emit.getStart() - lastCollectedPosition > 1) { + tokens.add( createFragment( emit, text, lastCollectedPosition) ); + } + + tokens.add(createMatch(emit, text)); + lastCollectedPosition = emit.getEnd(); + } + + if (text.length() - lastCollectedPosition > 1) { + tokens.add( createFragment( null, text, lastCollectedPosition) ); + } + + return tokens; + } + + private PayloadToken createFragment(final PayloadEmit emit, final String text, final int lastCollectedPosition) { + return new PayloadFragmentToken<>( + text.substring( lastCollectedPosition + 1, + emit == null ? text.length() : emit.getStart() ) ); + } + + private PayloadToken createMatch(PayloadEmit emit, String text) { + return new PayloadMatchToken<>( text.substring( emit.getStart(), + emit.getEnd() + 1 ), + emit ); + } + + /** + * Tokenizes a specified text and returns the emitted outputs. + * + * @param text The character sequence to tokenize. + * @return A collection of emits. + */ + public Collection> parseText(final CharSequence text) { + return parseText(text, new DefaultPayloadEmitHandler<>()); + } + + /** + * Tokenizes the specified text by using a custom EmitHandler and returns the + * emitted outputs. + * + * @param text The character sequence to tokenize. + * @param emitHandler The handler that will be used to parse the text. + * @return A collection of emits. 
+ */ + @SuppressWarnings("unchecked") + public Collection> parseText(final CharSequence text, final StatefulPayloadEmitHandler emitHandler) { + parseText(text, (PayloadEmitHandler) emitHandler); + + final List> collectedEmits = emitHandler.getEmits(); + + if (!trieConfig.isAllowOverlaps()) { + IntervalTree intervalTree = new IntervalTree((List) (List) collectedEmits); + intervalTree.removeOverlaps((List) (List) collectedEmits); + } + + return collectedEmits; + } + + /** + * Returns true if the text contains one of the search terms; otherwise, + * returns false. + * + * @param text Specified text. + * @return true if the text contains one of the search terms. Else, returns + * false. + */ + public boolean containsMatch(final CharSequence text) { + return firstMatch(text) != null; + } + + /** + * Tokenizes the specified text by using a custom EmitHandler and returns the + * emitted outputs. + * + * @param text The character sequence to tokenize. + * @param emitHandler The handler that will be used to parse the text. + */ + public void parseText(final CharSequence text, final PayloadEmitHandler emitHandler) { + PayloadState currentState = getRootState(); + + for (int position = 0; position < text.length(); position++) { + char character = text.charAt( position); + + if (trieConfig.isCaseInsensitive()) { + character = Character.toLowerCase(character); + } + + currentState = getState(currentState, character); + final Collection> payloads = currentState.emit(); + if (processEmits(text, position, payloads, emitHandler) && trieConfig.isStopOnHit()) { + return; + } + } + } + + /** + * The first matching text sequence. + * + * @param text The text to search for keywords, must not be {@code null}. + * @return {@code null} if no matches found. + */ + public PayloadEmit firstMatch(final CharSequence text) { + assert text != null; + + if (!trieConfig.isAllowOverlaps()) { + // Slow path. Needs to find all the matches to detect overlaps. 
+ final Collection> parseText = parseText(text); + + if (parseText != null && !parseText.isEmpty()) { + return parseText.iterator().next(); + } + } else { + // Fast path. Returns first match found. + PayloadState currentState = getRootState(); + + for (int position = 0; position < text.length(); position++) { + char character = text.charAt( position); + + if (trieConfig.isCaseInsensitive()) { + character = Character.toLowerCase(character); + } + + currentState = getState(currentState, character); + Collection> payloads = currentState.emit(); + + if (payloads != null && !payloads.isEmpty()) { + for (final Payload payload : payloads) { + final PayloadEmit emit = new PayloadEmit<>(position - payload.getKeyword().length() + 1, position, + payload.getKeyword(), payload.getData()); + if (trieConfig.isOnlyWholeWords()) { + if (!isPartialMatch(text, emit)) { + return emit; + } + } else { + return emit; + } + } + } + } + } + + return null; + } + + private boolean isPartialMatch(final CharSequence searchText, final PayloadEmit emit) { + return (emit.getStart() != 0 && Character.isAlphabetic(searchText.charAt(emit.getStart() - 1))) + || (emit.getEnd() + 1 != searchText.length() && Character.isAlphabetic(searchText.charAt(emit.getEnd() + 1))); + } + + private boolean isPartialMatchWhiteSpaceSeparated(final CharSequence searchText, final PayloadEmit emit) { + final long size = searchText.length(); + return (emit.getStart() != 0 && !isWhitespace(searchText.charAt(emit.getStart() - 1))) + || (emit.getEnd() + 1 != size && !isWhitespace(searchText.charAt(emit.getEnd() + 1))); + } + + private PayloadState getState(PayloadState currentState, final Character character) { + PayloadState newCurrentState = currentState.nextState(character); + + while (newCurrentState == null) { + currentState = currentState.failure(); + newCurrentState = currentState.nextState(character); + } + + return newCurrentState; + } + + private void constructFailureStates() { + final Queue> queue = new 
LinkedBlockingDeque<>(); + final PayloadState startState = getRootState(); + + // First, set the fail state of all depth 1 states to the root state + for (PayloadState depthOneState : startState.getStates()) { + depthOneState.setFailure(startState); + queue.add(depthOneState); + } + + // Second, determine the fail state for all depth > 1 state + while (!queue.isEmpty()) { + final PayloadState currentState = queue.remove(); + + for (final Character transition : currentState.getTransitions()) { + PayloadState targetState = currentState.nextState(transition); + queue.add(targetState); + + PayloadState traceFailureState = currentState.failure(); + while (traceFailureState.nextState(transition) == null) { + traceFailureState = traceFailureState.failure(); + } + + final PayloadState newFailureState = traceFailureState.nextState(transition); + targetState.setFailure(newFailureState); + targetState.addEmit(newFailureState.emit()); + } + } + } + + private boolean processEmits(final CharSequence text, final int position, final Collection> payloads, final PayloadEmitHandler emitHandler) { + boolean emitted = false; + for (final Payload payload : payloads) { + final PayloadEmit payloadEmit = new PayloadEmit<>(position - payload.getKeyword().length() + 1, + position, payload.getKeyword(), payload.getData()); + if (!(trieConfig.isOnlyWholeWords() && isPartialMatch(text, payloadEmit)) && + !(trieConfig.isOnlyWholeWordsWhiteSpaceSeparated() && isPartialMatchWhiteSpaceSeparated(text, payloadEmit))) { + emitted = emitHandler.emit(payloadEmit) || emitted; + if (emitted && trieConfig.isStopOnHit()) { + break; + } + } + } + + return emitted; + } + + private boolean isCaseInsensitive() { + return trieConfig.isCaseInsensitive(); + } + + private PayloadState getRootState() { + return this.rootState; + } + + /** + * Provides a fluent interface for constructing Trie instances with payloads. + * @param The type of the emitted payload. + * + * @return The builder used to configure its Trie. 
+ */ + public static PayloadTrieBuilder builder() { + return new PayloadTrieBuilder<>(); + } + + /** + * Builder class to create a PayloadTrie instance. + * + * @param The type of the emitted payload. + */ + public static class PayloadTrieBuilder { + + private final TrieConfig trieConfig = new TrieConfig(); + + private final PayloadTrie trie = new PayloadTrie<>(trieConfig); + + /** + * Default (empty) constructor. + */ + private PayloadTrieBuilder() { + } + + /** + * Configure the Trie to ignore case when searching for keywords in the text. + * This must be called before calling addKeyword because the algorithm converts + * keywords to lowercase as they are added, depending on this case sensitivity + * setting. + * + * @return This builder. + */ + public PayloadTrieBuilder ignoreCase() { + this.trieConfig.setCaseInsensitive(true); + return this; + } + + /** + * Configure the Trie to ignore overlapping keywords. + * + * @return This builder. + */ + public PayloadTrieBuilder ignoreOverlaps() { + this.trieConfig.setAllowOverlaps(false); + return this; + } + + /** + * Adds a keyword to the {@link Trie}'s list of text search keywords. + * No {@link Payload} is supplied. + * + * @param keyword The keyword to add to the list. + * @return This builder. + * @throws NullPointerException if the keyword is null. + */ + public PayloadTrieBuilder addKeyword(final String keyword) { + this.trie.addKeyword(keyword); + return this; + } + + /** + * Adds a keyword and a payload to the {@link Trie}'s list of text + * search keywords. + * + * @param keyword The keyword to add to the list. + * @param payload the payload to add + * @return This builder. + * @throws NullPointerException if the keyword is null. + */ + public PayloadTrieBuilder addKeyword(final String keyword, final T payload) { + this.trie.addKeyword(keyword, payload); + return this; + } + + /** + * Adds a list of keywords and payloads to the {@link Trie}'s list of + * text search keywords. 
+ * + * @param keywords The keywords to add to the list. + * @return This builder. + */ + public PayloadTrieBuilder addKeywords(final Collection> keywords) { + for (Payload payload : keywords) { + this.trie.addKeyword(payload.getKeyword(), payload.getData()); + } + return this; + } + + /** + * Configure the Trie to match whole keywords in the text. + * + * @return This builder. + */ + public PayloadTrieBuilder onlyWholeWords() { + this.trieConfig.setOnlyWholeWords(true); + return this; + } + + /** + * Configure the Trie to match whole keywords that are separated by whitespace + * in the text. For example, "this keyword thatkeyword" would only match the + * first occurrence of "keyword". + * + * @return This builder. + */ + public PayloadTrieBuilder onlyWholeWordsWhiteSpaceSeparated() { + this.trieConfig.setOnlyWholeWordsWhiteSpaceSeparated(true); + return this; + } + + /** + * Configure the Trie to stop after the first keyword is found in the text. + * + * @return This builder. + */ + public PayloadTrieBuilder stopOnHit() { + trie.trieConfig.setStopOnHit(true); + return this; + } + + /** + * Configure the PayloadTrie based on the builder settings. + * + * @return The configured PayloadTrie. + */ + public PayloadTrie build() { + this.trie.constructFailureStates(); + return this.trie; + } + + /** + * @return This builder. + * @deprecated Use ignoreCase() + */ + @Deprecated + public PayloadTrieBuilder caseInsensitive() { + return ignoreCase(); + } + + /** + * @return This builder. 
+ * @deprecated Use ignoreOverlaps() + */ + @Deprecated + public PayloadTrieBuilder removeOverlaps() { + return ignoreOverlaps(); + } + } +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/State.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/State.java new file mode 100644 index 00000000..817490c1 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/State.java @@ -0,0 +1,134 @@ +package com.ismartcoding.lib.ahocorasick.trie; + +import java.util.*; + +/** + *

+ * A state has various important tasks it must attend to: + *

+ *
    + *
  • success; when a character points to another state, it must return that state
  • + *
  • failure; when a character has no matching state, the algorithm must be able to fall back on a + * state with less depth
  • + *
  • emits; when this state is passed and keywords have been matched, the matches must be + * 'emitted' so that they can be used later on.
  • + *
+ *

+ * The root state is special in the sense that it has no failure state; it cannot fail. If it 'fails' + * it will still parse the next character and start from the root node. This ensures that the algorithm + * always runs. All other states always have a fail state. + *

+ * + * @author Robert Bor + */ +public class State { + + /** + * effective the size of the keyword + */ + private final int depth; + + /** + * only used for the root state to refer to itself in case no matches have been found + */ + private final State rootState; + + /** + * referred to in the white paper as the 'goto' structure. From a state it is possible to go + * to other states, depending on the character passed. + */ + private final Map success = new HashMap<>(); + + /** + * if no matching states are found, the failure state will be returned + */ + private State failure; + + /** + * whenever this state is reached, it will emit the matches keywords for future reference + */ + private Set emits; + + public State() { + this(0); + } + + public State(final int depth) { + this.depth = depth; + this.rootState = depth == 0 ? this : null; + } + + private State nextState(final Character character, final boolean ignoreRootState) { + State nextState = this.success.get(character); + + if (!ignoreRootState && nextState == null && this.rootState != null) { + nextState = this.rootState; + } + + return nextState; + } + + public State nextState(final Character character) { + return nextState(character, false); + } + + public State nextStateIgnoreRootState(Character character) { + return nextState(character, true); + } + + public State addState(String keyword) { + State state = this; + + for (final Character character : keyword.toCharArray()) { + state = state.addState(character); + } + + return state; + } + + public State addState(Character character) { + State nextState = nextStateIgnoreRootState(character); + if (nextState == null) { + nextState = new State(this.depth + 1); + this.success.put(character, nextState); + } + return nextState; + } + + public int getDepth() { + return this.depth; + } + + public void addEmit(String keyword) { + if (this.emits == null) { + this.emits = new TreeSet<>(); + } + this.emits.add(keyword); + } + + public void addEmit(Collection emits) { + 
for (String emit : emits) { + addEmit(emit); + } + } + + public Collection emit() { + return this.emits == null ? Collections.emptyList() : this.emits; + } + + public State failure() { + return this.failure; + } + + public void setFailure(State failState) { + this.failure = failState; + } + + public Collection getStates() { + return this.success.values(); + } + + public Collection getTransitions() { + return this.success.keySet(); + } +} \ No newline at end of file diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/Token.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/Token.java new file mode 100644 index 00000000..731bd518 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/Token.java @@ -0,0 +1,17 @@ +package com.ismartcoding.lib.ahocorasick.trie; + +public abstract class Token { + private String fragment; + + public Token(String fragment) { + this.fragment = fragment; + } + + public String getFragment() { + return this.fragment; + } + + public abstract boolean isMatch(); + + public abstract Emit getEmit(); +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/Trie.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/Trie.java new file mode 100644 index 00000000..c9e8d3f6 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/Trie.java @@ -0,0 +1,228 @@ +package com.ismartcoding.lib.ahocorasick.trie; + +import java.util.ArrayList; +import java.util.Collection; + +import com.ismartcoding.lib.ahocorasick.trie.PayloadTrie.PayloadTrieBuilder; +import com.ismartcoding.lib.ahocorasick.trie.handler.EmitHandler; +import com.ismartcoding.lib.ahocorasick.trie.handler.StatefulPayloadEmitDelegateHandler; +import com.ismartcoding.lib.ahocorasick.trie.handler.PayloadEmitDelegateHandler; +import com.ismartcoding.lib.ahocorasick.trie.handler.StatefulEmitHandler; + +/** + * Based on the Aho-Corasick white + * paper, from Bell technologies. 
+ * + * @author Robert Bor + */ +public class Trie { + + private final PayloadTrie payloadTrie; + + private Trie(final PayloadTrie payloadTrie) { + this.payloadTrie = payloadTrie; + } + + public Collection tokenize(final String text) { + Collection> tokens = this.payloadTrie.tokenize(text); + return asTokens(tokens); + } + + private static Collection asTokens(Collection> tokens) { + Collection result = new ArrayList<>(); + for (PayloadToken payloadToken : tokens) { + result.add(new DefaultToken(payloadToken)); + } + return result; + } + + private static Collection asEmits(Collection> emits) { + Collection result = new ArrayList<>(); + for (PayloadEmit emit : emits) { + result.add(asEmit(emit)); + } + return result; + } + + private static Emit asEmit(PayloadEmit payloadEmit) { + return new Emit(payloadEmit.getStart(), payloadEmit.getEnd(), payloadEmit.getKeyword()); + } + + public Collection parseText(final CharSequence text) { + Collection> parsedText = this.payloadTrie.parseText(text); + return asEmits(parsedText); + } + + @SuppressWarnings("UnusedReturnValue") + public Collection parseText( final CharSequence text, final StatefulEmitHandler emitHandler) { + Collection> parsedText = this.payloadTrie.parseText(text, + new StatefulPayloadEmitDelegateHandler(emitHandler)); + return asEmits(parsedText); + } + + public boolean containsMatch(final CharSequence text) { + return firstMatch(text) != null; + } + + public void parseText(final CharSequence text, final EmitHandler emitHandler) { + this.payloadTrie.parseText(text, new PayloadEmitDelegateHandler(emitHandler)); + } + + /** + * The first matching text sequence. + * + * @param text The text to search for keywords, must not be {@code null}. + * @return {@code null} if no matches found. + */ + public Emit firstMatch(final CharSequence text) { + assert text != null; + + final PayloadEmit payload = this.payloadTrie.firstMatch( text ); + return payload == null + ? 
null + : new Emit( payload.getStart(), + payload.getEnd(), + payload.getKeyword() ); + } + + /** + * Provides a fluent interface for constructing Trie instances. + * + * @return The builder used to configure its Trie. + */ + public static TrieBuilder builder() { + return new TrieBuilder(); + } + + public static class TrieBuilder { + + private final PayloadTrieBuilder delegate = PayloadTrie.builder(); + + /** + * Default (empty) constructor. + */ + private TrieBuilder() { + } + + /** + * Configure the Trie to ignore case when searching for keywords in the text. + * This must be called before calling addKeyword because the algorithm converts + * keywords to lowercase as they are added, depending on this case sensitivity + * setting. + * + * @return This builder. + */ + public TrieBuilder ignoreCase() { + delegate.ignoreCase(); +// this.trieConfig.setCaseInsensitive(true); + return this; + } + + /** + * Configure the Trie to ignore overlapping keywords. + * + * @return This builder. + */ + public TrieBuilder ignoreOverlaps() { + delegate.ignoreOverlaps(); + return this; + } + + /** + * Adds a keyword to the Trie's list of text search keywords. + * + * @param keyword The keyword to add to the list. + * @return This builder. + * @throws NullPointerException if the keyword is null. + */ + public TrieBuilder addKeyword(final String keyword) { + delegate.addKeyword(keyword, null); + return this; + } + + /** + * Adds a list of keywords to the Trie's list of text search keywords. + * + * @param keywords The keywords to add to the list. + * @return This builder. + */ + public TrieBuilder addKeywords(final String... keywords) { + for (String keyword : keywords) { + delegate.addKeyword(keyword, null); + } + return this; + } + + /** + * Adds a list of keywords to the Trie's list of text search keywords. + * + * @param keywords The keywords to add to the list. + * @return This builder. 
+ */ + @SuppressWarnings("unused") + public TrieBuilder addKeywords( final Collection keywords ) { + for (String keyword : keywords) { + this.delegate.addKeyword(keyword, null); + } + return this; + } + + /** + * Configure the Trie to match whole keywords in the text. + * + * @return This builder. + */ + public TrieBuilder onlyWholeWords() { + this.delegate.onlyWholeWords(); + return this; + } + + /** + * Configure the Trie to match whole keywords that are separated by whitespace + * in the text. For example, "this keyword thatkeyword" would only match the + * first occurrence of "keyword". + * + * @return This builder. + */ + public TrieBuilder onlyWholeWordsWhiteSpaceSeparated() { + this.delegate.onlyWholeWordsWhiteSpaceSeparated(); + return this; + } + + /** + * Configure the Trie to stop after the first keyword is found in the text. + * + * @return This builder. + */ + public TrieBuilder stopOnHit() { + this.delegate.stopOnHit(); + return this; + } + + /** + * Configure the Trie based on the builder settings. + * + * @return The configured Trie. + */ + public Trie build() { + PayloadTrie payloadTrie = this.delegate.build(); + return new Trie(payloadTrie); + } + + /** + * @return This builder. + * @deprecated Use ignoreCase() + */ + public TrieBuilder caseInsensitive() { + return ignoreCase(); + } + + /** + * @return This builder. 
+ * @deprecated Use ignoreOverlaps() + */ + public TrieBuilder removeOverlaps() { + return ignoreOverlaps(); + } + } +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/TrieConfig.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/TrieConfig.java new file mode 100644 index 00000000..0025e462 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/TrieConfig.java @@ -0,0 +1,54 @@ +package com.ismartcoding.lib.ahocorasick.trie; + +public class TrieConfig { + + private boolean allowOverlaps = true; + + private boolean onlyWholeWords = false; + + private boolean onlyWholeWordsWhiteSpaceSeparated = false; + + private boolean caseInsensitive = false; + + private boolean stopOnHit = false; + + public boolean isStopOnHit() { + return stopOnHit; + } + + public void setStopOnHit(boolean stopOnHit) { + this.stopOnHit = stopOnHit; + } + + public boolean isAllowOverlaps() { + return allowOverlaps; + } + + public void setAllowOverlaps(boolean allowOverlaps) { + this.allowOverlaps = allowOverlaps; + } + + public boolean isOnlyWholeWords() { + return onlyWholeWords; + } + + public void setOnlyWholeWords(boolean onlyWholeWords) { + this.onlyWholeWords = onlyWholeWords; + } + + public boolean isOnlyWholeWordsWhiteSpaceSeparated() { + return onlyWholeWordsWhiteSpaceSeparated; + } + + public void setOnlyWholeWordsWhiteSpaceSeparated(boolean onlyWholeWordsWhiteSpaceSeparated) { + this.onlyWholeWordsWhiteSpaceSeparated = onlyWholeWordsWhiteSpaceSeparated; + } + + public boolean isCaseInsensitive() { + return caseInsensitive; + } + + public void setCaseInsensitive(boolean caseInsensitive) { + this.caseInsensitive = caseInsensitive; + } +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/AbstractStatefulEmitHandler.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/AbstractStatefulEmitHandler.java new file mode 100644 index 00000000..e83b3d87 --- /dev/null +++ 
b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/AbstractStatefulEmitHandler.java @@ -0,0 +1,21 @@ +package com.ismartcoding.lib.ahocorasick.trie.handler; + +import com.ismartcoding.lib.ahocorasick.trie.Emit; + +import java.util.ArrayList; +import java.util.List; + +public abstract class AbstractStatefulEmitHandler implements StatefulEmitHandler { + + private final List emits = new ArrayList(); + + public void addEmit(final Emit emit) { + this.emits.add(emit); + } + + @Override + public List getEmits() { + return this.emits; + } + +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/AbstractStatefulPayloadEmitHandler.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/AbstractStatefulPayloadEmitHandler.java new file mode 100644 index 00000000..04f5e703 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/AbstractStatefulPayloadEmitHandler.java @@ -0,0 +1,21 @@ +package com.ismartcoding.lib.ahocorasick.trie.handler; + +import java.util.ArrayList; +import java.util.List; + +import com.ismartcoding.lib.ahocorasick.trie.PayloadEmit; + +public abstract class AbstractStatefulPayloadEmitHandler implements StatefulPayloadEmitHandler { + + private final List> emits = new ArrayList<>(); + + public void addEmit(final PayloadEmit emit) { + this.emits.add(emit); + } + + @Override + public List> getEmits() { + return this.emits; + } + +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/DefaultEmitHandler.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/DefaultEmitHandler.java new file mode 100644 index 00000000..01071bfb --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/DefaultEmitHandler.java @@ -0,0 +1,22 @@ +package com.ismartcoding.lib.ahocorasick.trie.handler; + +import java.util.ArrayList; +import java.util.List; + +import com.ismartcoding.lib.ahocorasick.trie.Emit; + +public class 
DefaultEmitHandler implements StatefulEmitHandler { + + private final List emits = new ArrayList<>(); + + @Override + public boolean emit(final Emit emit) { + this.emits.add(emit); + return true; + } + + @Override + public List getEmits() { + return this.emits; + } +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/DefaultPayloadEmitHandler.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/DefaultPayloadEmitHandler.java new file mode 100644 index 00000000..3e3bc76d --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/DefaultPayloadEmitHandler.java @@ -0,0 +1,22 @@ +package com.ismartcoding.lib.ahocorasick.trie.handler; + +import java.util.ArrayList; +import java.util.List; + +import com.ismartcoding.lib.ahocorasick.trie.PayloadEmit; + +public class DefaultPayloadEmitHandler implements StatefulPayloadEmitHandler { + + private final List> emits = new ArrayList<>(); + + @Override + public boolean emit(final PayloadEmit emit) { + this.emits.add(emit); + return true; + } + + @Override + public List> getEmits() { + return this.emits; + } +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/EmitHandler.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/EmitHandler.java new file mode 100644 index 00000000..f370203b --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/EmitHandler.java @@ -0,0 +1,7 @@ +package com.ismartcoding.lib.ahocorasick.trie.handler; + +import com.ismartcoding.lib.ahocorasick.trie.Emit; + +public interface EmitHandler { + boolean emit(Emit emit); +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/PayloadEmitDelegateHandler.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/PayloadEmitDelegateHandler.java new file mode 100644 index 00000000..4350ca70 --- /dev/null +++ 
b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/PayloadEmitDelegateHandler.java @@ -0,0 +1,25 @@ +package com.ismartcoding.lib.ahocorasick.trie.handler; + +import com.ismartcoding.lib.ahocorasick.trie.Emit; +import com.ismartcoding.lib.ahocorasick.trie.PayloadEmit; + +/** + * Convenience wrapper class that delegates every method to an + * instance of {@link EmitHandler}. + */ +public class PayloadEmitDelegateHandler implements PayloadEmitHandler { + + private EmitHandler handler; + + public PayloadEmitDelegateHandler(EmitHandler handler) { + this.handler = handler; + + } + + @Override + public boolean emit(PayloadEmit emit) { + Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()); + return handler.emit(newEmit); + } + +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/PayloadEmitHandler.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/PayloadEmitHandler.java new file mode 100644 index 00000000..f929786c --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/PayloadEmitHandler.java @@ -0,0 +1,7 @@ +package com.ismartcoding.lib.ahocorasick.trie.handler; + +import com.ismartcoding.lib.ahocorasick.trie.PayloadEmit; + +public interface PayloadEmitHandler { + boolean emit(PayloadEmit emit); +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/StatefulEmitHandler.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/StatefulEmitHandler.java new file mode 100644 index 00000000..444b7f9e --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/StatefulEmitHandler.java @@ -0,0 +1,8 @@ +package com.ismartcoding.lib.ahocorasick.trie.handler; + +import java.util.List; +import com.ismartcoding.lib.ahocorasick.trie.Emit; + +public interface StatefulEmitHandler extends EmitHandler { + List getEmits(); +} diff --git 
a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/StatefulPayloadEmitDelegateHandler.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/StatefulPayloadEmitDelegateHandler.java new file mode 100644 index 00000000..b546c025 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/StatefulPayloadEmitDelegateHandler.java @@ -0,0 +1,42 @@ +package com.ismartcoding.lib.ahocorasick.trie.handler; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import com.ismartcoding.lib.ahocorasick.trie.Emit; +import com.ismartcoding.lib.ahocorasick.trie.PayloadEmit; + +/** + * Convenience wrapper class that delegates every method to a + * {@link StatefulPayloadEmitHandler}. + */ +public class StatefulPayloadEmitDelegateHandler implements StatefulPayloadEmitHandler { + + private StatefulEmitHandler handler; + + public StatefulPayloadEmitDelegateHandler(StatefulEmitHandler handler) { + this.handler = handler; + + } + + private static List> asEmits(Collection emits) { + List> result = new ArrayList<>(); + for (Emit emit : emits) { + result.add(new PayloadEmit(emit.getStart(), emit.getEnd(), emit.getKeyword(), null)); + } + return result; + } + + @Override + public boolean emit(PayloadEmit emit) { + Emit newEmit = new Emit(emit.getStart(), emit.getEnd(), emit.getKeyword()); + return handler.emit(newEmit); + } + + @Override + public List> getEmits() { + List emits = this.handler.getEmits(); + return asEmits(emits); + } +} diff --git a/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/StatefulPayloadEmitHandler.java b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/StatefulPayloadEmitHandler.java new file mode 100644 index 00000000..c3f650f8 --- /dev/null +++ b/lib/src/main/java/com/ismartcoding/lib/ahocorasick/trie/handler/StatefulPayloadEmitHandler.java @@ -0,0 +1,9 @@ +package com.ismartcoding.lib.ahocorasick.trie.handler; + +import 
com.ismartcoding.lib.ahocorasick.trie.PayloadEmit; + +import java.util.List; + +public interface StatefulPayloadEmitHandler extends PayloadEmitHandler{ + List> getEmits(); +} diff --git a/lib/src/main/java/com/ismartcoding/lib/pinyin/Engine.kt b/lib/src/main/java/com/ismartcoding/lib/pinyin/Engine.kt index 0caea045..ac97c9ea 100644 --- a/lib/src/main/java/com/ismartcoding/lib/pinyin/Engine.kt +++ b/lib/src/main/java/com/ismartcoding/lib/pinyin/Engine.kt @@ -1,9 +1,9 @@ package com.ismartcoding.lib.pinyin +import com.ismartcoding.lib.ahocorasick.trie.Emit +import com.ismartcoding.lib.ahocorasick.trie.Trie import com.ismartcoding.lib.pinyin.Pinyin.toPinyin import com.ismartcoding.lib.pinyin.Utils.dictsToTrie -import org.ahocorasick.trie.Emit -import org.ahocorasick.trie.Trie import java.util.* /** diff --git a/lib/src/main/java/com/ismartcoding/lib/pinyin/ForwardLongestSelector.kt b/lib/src/main/java/com/ismartcoding/lib/pinyin/ForwardLongestSelector.kt index 0ade524c..4d243248 100644 --- a/lib/src/main/java/com/ismartcoding/lib/pinyin/ForwardLongestSelector.kt +++ b/lib/src/main/java/com/ismartcoding/lib/pinyin/ForwardLongestSelector.kt @@ -1,7 +1,7 @@ package com.ismartcoding.lib.pinyin +import com.ismartcoding.lib.ahocorasick.trie.Emit import com.ismartcoding.lib.pinyin.Engine.EmitComparator -import org.ahocorasick.trie.Emit import java.util.* /** diff --git a/lib/src/main/java/com/ismartcoding/lib/pinyin/Pinyin.kt b/lib/src/main/java/com/ismartcoding/lib/pinyin/Pinyin.kt index e643e348..52981979 100644 --- a/lib/src/main/java/com/ismartcoding/lib/pinyin/Pinyin.kt +++ b/lib/src/main/java/com/ismartcoding/lib/pinyin/Pinyin.kt @@ -1,7 +1,7 @@ package com.ismartcoding.lib.pinyin +import com.ismartcoding.lib.ahocorasick.trie.Trie import com.ismartcoding.lib.pinyin.Utils.dictsToTrie -import org.ahocorasick.trie.Trie import java.util.* import kotlin.experimental.and import kotlin.experimental.or diff --git 
a/lib/src/main/java/com/ismartcoding/lib/pinyin/SegmentationSelector.kt b/lib/src/main/java/com/ismartcoding/lib/pinyin/SegmentationSelector.kt index df2308e3..7a882d68 100644 --- a/lib/src/main/java/com/ismartcoding/lib/pinyin/SegmentationSelector.kt +++ b/lib/src/main/java/com/ismartcoding/lib/pinyin/SegmentationSelector.kt @@ -1,6 +1,6 @@ package com.ismartcoding.lib.pinyin -import org.ahocorasick.trie.Emit +import com.ismartcoding.lib.ahocorasick.trie.Emit /** * 分词选择算法应实现的接口 diff --git a/lib/src/main/java/com/ismartcoding/lib/pinyin/Utils.kt b/lib/src/main/java/com/ismartcoding/lib/pinyin/Utils.kt index b9abd71b..b89a6610 100644 --- a/lib/src/main/java/com/ismartcoding/lib/pinyin/Utils.kt +++ b/lib/src/main/java/com/ismartcoding/lib/pinyin/Utils.kt @@ -1,6 +1,6 @@ package com.ismartcoding.lib.pinyin -import org.ahocorasick.trie.Trie +import com.ismartcoding.lib.ahocorasick.trie.Trie import java.util.* internal object Utils {