Skip to content

Commit

Permalink
v0.4.0: String queries including fuzzy search based on tries (#5)
Browse files Browse the repository at this point in the history
# v0.4.0 🚀
- Prefix-Trie & Suffix-Trie implementations
- Prefix index based on the Prefix-Trie
  - StartsWith-Queries for fast prefix matching
  - Fast fuzzy matching is supported as well
- FullText index based on the Suffix-Trie
  - Contains-Queries for fast infix matching
  - Fast fuzzy matching is supported as well, enabling (complex) search scenarios
  - Also supports StartsWith queries
- Updated Readme and Unit-Tests
  • Loading branch information
akade authored May 25, 2022
1 parent f089305 commit 5c2319f
Show file tree
Hide file tree
Showing 28 changed files with 1,266 additions and 47 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ jobs:

# Initializes the CodeQL tools for scanning.
- name: Initialize CodeQL
uses: github/codeql-action/init@v1
uses: github/codeql-action/init@v2
with:
languages: ${{ matrix.language }}
# If you wish to specify custom queries, you can do so here or in a config file.
Expand All @@ -59,4 +59,4 @@ jobs:
run: dotnet build

- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v1
uses: github/codeql-action/analyze@v2
6 changes: 3 additions & 3 deletions Akade.IndexedSet.Tests/Akade.IndexedSet.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.1.0" />
<PackageReference Include="MSTest.TestAdapter" Version="2.2.9" />
<PackageReference Include="MSTest.TestFramework" Version="2.2.9" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.2.0" />
<PackageReference Include="MSTest.TestAdapter" Version="2.2.10" />
<PackageReference Include="MSTest.TestFramework" Version="2.2.10" />
<PackageReference Include="coverlet.collector" Version="3.1.2">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
Expand Down
131 changes: 131 additions & 0 deletions Akade.IndexedSet.Tests/DataStructures/SuffixTrieTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
using Akade.IndexedSet.DataStructures;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace Akade.IndexedSet.Tests.DataStructures;

/// <summary>
/// Tests for <c>SuffixTrie</c> with string elements keyed by their own text:
/// lookups through <c>GetAll</c> with prefixes, infixes and suffixes,
/// add / contains / remove round trips, and <c>FuzzySearch</c>.
/// </summary>
[TestClass]
public class SuffixTrieTests
{
    private SuffixTrie<string> _trie = default!;

    /// <summary>
    /// Rebuilds the shared trie before each test with a fixed list of animal names.
    /// </summary>
    [TestInitialize]
    public void TestInitializer()
    {
        _trie = new();

        // Insertion order matches the original fixture.
        string[] animalNames = { "Tiger", "Tarantula", "Penguin", "Panther", "Pangolin", "Parrot", "Chihuahua" };
        foreach (string animalName in animalNames)
        {
            _ = AddToStringTrie(_trie, animalName);
        }
    }

    [TestMethod]
    public void querying_common_prefixes_return_correct_elements()
    {
        AssertGetAll("T", "Tiger", "Tarantula");
        AssertGetAll("P", "Penguin", "Panther", "Pangolin", "Parrot");
        AssertGetAll("Pa", "Panther", "Pangolin", "Parrot");
        AssertGetAll("Pan", "Panther", "Pangolin");
        AssertGetAll("Pant", "Panther");
    }

    [TestMethod]
    public void querying_common_infixes_return_correct_elements()
    {
        AssertGetAll("an", "Panther", "Pangolin", "Tarantula");
        AssertGetAll("hua", "Chihuahua");
    }

    [TestMethod]
    public void querying_common_suffixes_return_correct_elements()
    {
        AssertGetAll("er", "Panther", "Tiger");
        AssertGetAll("hua", "Chihuahua");
        AssertGetAll("in", "Penguin", "Pangolin");
    }

    [TestMethod]
    public void adding_the_same_element_return_false()
    {
        SuffixTrie<string> emptyTrie = new();

        bool firstAdd = AddToStringTrie(emptyTrie, "Cat");
        Assert.IsTrue(firstAdd);

        bool secondAdd = AddToStringTrie(emptyTrie, "Cat");
        Assert.IsFalse(secondAdd);
    }

    [TestMethod]
    public void contains_returns_correct_value_when_adding_elements()
    {
        SuffixTrie<string> emptyTrie = new();

        Assert.IsFalse(ContainsInStringTrie(emptyTrie, "Cat"));
        Assert.IsTrue(AddToStringTrie(emptyTrie, "Cat"));
        Assert.IsTrue(ContainsInStringTrie(emptyTrie, "Cat"));
    }

    [TestMethod]
    public void contains_returns_correct_value_when_removing_elements()
    {
        SuffixTrie<string> singleCatTrie = new();
        _ = AddToStringTrie(singleCatTrie, "Cat");

        Assert.IsTrue(ContainsInStringTrie(singleCatTrie, "Cat"));
        Assert.IsTrue(RemoveFromStringTrie(singleCatTrie, "Cat"));
        Assert.IsFalse(ContainsInStringTrie(singleCatTrie, "Cat"));
    }

    [TestMethod]
    public void removing_returns_false_if_the_element_is_not_present()
    {
        SuffixTrie<string> emptyTrie = new();

        bool removed = RemoveFromStringTrie(emptyTrie, "Cat");

        Assert.IsFalse(removed);
    }

    // Reference kept from the original file (it sat above AddToStringTrie); presumably it
    // describes the approach behind FuzzySearch - TODO confirm:
    // https://murilo.wordpress.com/2011/02/01/fast-and-easy-levenshtein-distance-using-a-trie-in-c/

    [TestMethod]
    public void exact_fuzzy_search_with_single_result()
    {
        string onlyMatch = _trie.FuzzySearch("rantul", 1, true).Single();

        Assert.AreEqual("Tarantula", onlyMatch);
    }

    [TestMethod]
    public void exact_fuzzy_search_without_results()
    {
        bool anyMatch = _trie.FuzzySearch("Panner", 1, true).Any();

        Assert.IsFalse(anyMatch);
    }

    [TestMethod]
    public void inexact_fuzzy_search_and_multiple_result()
    {
        string[] matches = _trie.FuzzySearch("Pan", 2, false).ToArray();

        CollectionAssert.AreEquivalent(new[] { "Penguin", "Panther", "Pangolin", "Parrot", "Tarantula", "Chihuahua" }, matches);
    }

    [TestMethod]
    public void inexact_fuzzy_search_without_result()
    {
        bool anyMatch = _trie.FuzzySearch("Non", 1, false).Any();

        Assert.IsFalse(anyMatch);
    }

    /// <summary>
    /// Asserts that <c>GetAll</c> on the shared trie yields exactly <paramref name="expected"/>
    /// for <paramref name="query"/>, ignoring order.
    /// </summary>
    private void AssertGetAll(string query, params string[] expected)
    {
        CollectionAssert.AreEquivalent(expected, _trie.GetAll(query).ToArray());
    }

    /// <summary>Adds <paramref name="value"/> using the string itself as both key and element.</summary>
    private static bool AddToStringTrie(SuffixTrie<string> stringTrie, string value)
        => stringTrie.Add(value, value);

    /// <summary>Removes <paramref name="value"/> using the string itself as both key and element.</summary>
    private static bool RemoveFromStringTrie(SuffixTrie<string> stringTrie, string value)
        => stringTrie.Remove(value, value);

    /// <summary>Checks for <paramref name="value"/> using the string itself as both key and element.</summary>
    private static bool ContainsInStringTrie(SuffixTrie<string> stringTrie, string value)
        => stringTrie.Contains(value, value);
}
122 changes: 122 additions & 0 deletions Akade.IndexedSet.Tests/DataStructures/TrieTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
using Akade.IndexedSet.DataStructures;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace Akade.IndexedSet.Tests.DataStructures;

/// <summary>
/// Tests for <c>Trie</c> with string elements keyed by their own text:
/// prefix lookups through <c>GetAll</c>, add / contains / remove round trips,
/// and <c>FuzzySearch</c>.
/// </summary>
[TestClass]
public class TrieTests
{
    /// <summary>
    /// Creates a fresh trie filled with a fixed list of animal names.
    /// </summary>
    private static Trie<string> GetAnimalTrie()
    {
        Trie<string> animals = new();

        // Insertion order matches the original fixture.
        foreach (string name in new[] { "Tiger", "Tarantula", "Penguin", "Panther", "Pangolin", "Parrot" })
        {
            _ = AddToStringTrie(animals, name);
        }

        return animals;
    }

    [TestMethod]
    public void querying_common_prefixes_return_correct_elements()
    {
        Trie<string> trie = GetAnimalTrie();

        // Each case pairs a queried prefix with the names expected back (order is ignored).
        (string Prefix, string[] Expected)[] cases =
        {
            ("T", new[] { "Tiger", "Tarantula" }),
            ("P", new[] { "Penguin", "Panther", "Pangolin", "Parrot" }),
            ("Pa", new[] { "Panther", "Pangolin", "Parrot" }),
            ("Pan", new[] { "Panther", "Pangolin" }),
            ("Pant", new[] { "Panther" }),
        };

        foreach ((string prefix, string[] expected) in cases)
        {
            CollectionAssert.AreEquivalent(expected, trie.GetAll(prefix).ToArray());
        }
    }

    [TestMethod]
    public void adding_the_same_element_returns_false()
    {
        Trie<string> emptyTrie = new();

        bool firstAdd = AddToStringTrie(emptyTrie, "Cat");
        Assert.IsTrue(firstAdd);

        bool secondAdd = AddToStringTrie(emptyTrie, "Cat");
        Assert.IsFalse(secondAdd);
    }

    [TestMethod]
    public void contains_returns_correct_value_when_adding_elements()
    {
        Trie<string> emptyTrie = new();

        Assert.IsFalse(ContainsInStringTrie(emptyTrie, "Cat"));
        Assert.IsTrue(AddToStringTrie(emptyTrie, "Cat"));
        Assert.IsTrue(ContainsInStringTrie(emptyTrie, "Cat"));
    }

    [TestMethod]
    public void contains_returns_correct_value_when_removing_elements()
    {
        Trie<string> singleCatTrie = new();
        _ = AddToStringTrie(singleCatTrie, "Cat");

        Assert.IsTrue(ContainsInStringTrie(singleCatTrie, "Cat"));
        Assert.IsTrue(RemoveFromStringTrie(singleCatTrie, "Cat"));
        Assert.IsFalse(ContainsInStringTrie(singleCatTrie, "Cat"));
    }

    [TestMethod]
    public void removing_returns_false_if_the_element_is_not_present()
    {
        Trie<string> emptyTrie = new();

        bool removed = RemoveFromStringTrie(emptyTrie, "Cat");

        Assert.IsFalse(removed);
    }

    [TestMethod]
    public void exact_fuzzy_search_with_single_result()
    {
        Trie<string> trie = GetAnimalTrie();

        string onlyMatch = trie.FuzzySearch("Panter", 1, true).Single();

        Assert.AreEqual("Panther", onlyMatch);
    }

    [TestMethod]
    public void exact_fuzzy_search_without_results()
    {
        Trie<string> trie = GetAnimalTrie();

        bool anyMatch = trie.FuzzySearch("Panner", 1, true).Any();

        Assert.IsFalse(anyMatch);
    }

    [TestMethod]
    public void inexact_fuzzy_search_and_multiple_result()
    {
        Trie<string> trie = GetAnimalTrie();

        string[] matches = trie.FuzzySearch("Pan", 2, false).ToArray();

        CollectionAssert.AreEquivalent(new[] { "Penguin", "Panther", "Pangolin", "Parrot" }, matches);
    }

    [TestMethod]
    public void inexact_fuzzy_search_without_result()
    {
        Trie<string> trie = GetAnimalTrie();

        bool anyMatch = trie.FuzzySearch("Non", 1, false).Any();

        Assert.IsFalse(anyMatch);
    }

    /// <summary>Adds <paramref name="value"/> using the string itself as both key and element.</summary>
    private static bool AddToStringTrie(Trie<string> stringTrie, string value)
        => stringTrie.Add(value, value);

    /// <summary>Removes <paramref name="value"/> using the string itself as both key and element.</summary>
    private static bool RemoveFromStringTrie(Trie<string> stringTrie, string value)
        => stringTrie.Remove(value, value);

    /// <summary>Checks for <paramref name="value"/> using the string itself as both key and element.</summary>
    private static bool ContainsInStringTrie(Trie<string> stringTrie, string value)
        => stringTrie.Contains(value, value);
}
96 changes: 96 additions & 0 deletions Akade.IndexedSet.Tests/FullTextIndices.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
using Akade.IndexedSet.Tests.TestUtilities;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace Akade.IndexedSet.Tests;

/// <summary>
/// Tests for full-text indices on an <c>IndexedSet</c> of animals, registered on both
/// <c>Category</c> and <c>Name</c>: single / multi item retrieval plus the
/// <c>StartsWith</c>, <c>FuzzyStartsWith</c>, <c>Contains</c> and <c>FuzzyContains</c> queries.
/// </summary>
/// <remarks>
/// NOTE(review): the selector lambdas passed to the queries are kept textually identical to the
/// ones used when the indices are registered in <see cref="Init"/>; presumably the library
/// matches indices by selector expression - confirm before renaming lambda parameters.
/// </remarks>
[TestClass]
public class FullTextIndices
{
    private record class Animal(string Name, string Category);

    private IndexedSet<Animal> _indexedSet = null!;
    private readonly Animal _bonobo = new("Bonobo", "Mammal");
    private readonly Animal _booby = new("Booby", "Bird");
    private readonly Animal _boomslang = new("Boomslang", "Reptile");
    private readonly Animal _borador = new("Borador", "Mammal");
    private readonly Animal _tiger = new("Tiger", "Mammal");
    private readonly Animal _tarantula = new("Tarantula", "Spider");
    private readonly Animal _tapir = new("Tapir", "Mammal");
    private readonly Animal _penguin = new("Penguin", "Bird");
    private readonly Animal _panther = new("Panther", "Mammal");
    private readonly Animal _pangolin = new("Pangolin", "Mammal");
    private readonly Animal _parrot = new("Parrot", "Bird");

    /// <summary>
    /// Builds the set under test from all animal fields, with one full-text index on
    /// <c>Category</c> and one on <c>Name</c>.
    /// </summary>
    [TestInitialize]
    public void Init()
    {
        Animal[] animals =
        {
            _bonobo, _booby, _boomslang, _borador,
            _tiger, _tarantula, _tapir,
            _penguin, _panther, _pangolin, _parrot,
        };

        _indexedSet = animals.ToIndexedSet()
                             .WithFullTextIndex(x => x.Category.AsMemory())
                             .WithFullTextIndex(x => x.Name.AsMemory())
                             .Build();
    }

    [TestMethod]
    public void single_item_retrieval_works()
    {
        // Boomslang is the only "Reptile" and Tarantula the only "Spider" in the fixture.
        _indexedSet.AssertSingleItem(x => x.Category.AsMemory(), _boomslang);
        _indexedSet.AssertSingleItem(x => x.Category.AsMemory(), _tarantula);
    }

    [TestMethod]
    [ExpectedException(typeof(InvalidOperationException))]
    public void single_item_retrieval_throws_exception_if_there_is_more_than_one_result()
    {
        // Several fixture animals share the "Mammal" category with the bonobo.
        _indexedSet.AssertSingleItem(x => x.Category.AsMemory(), _bonobo);
    }

    [TestMethod]
    public void multi_item_retrieval_works()
    {
        Animal[] mammals = { _bonobo, _borador, _tiger, _tapir, _panther, _pangolin };
        _indexedSet.AssertMultipleItems(x => x.Category.AsMemory(), expectedElements: mammals);

        Animal[] birds = { _booby, _penguin, _parrot };
        _indexedSet.AssertMultipleItems(x => x.Category.AsMemory(), expectedElements: birds);
    }

    [TestMethod]
    public void search_via_starts_with()
    {
        Animal[] startingWithBoo = _indexedSet.StartsWith(x => x.Name.AsMemory(), "Boo".AsMemory()).ToArray();
        CollectionAssert.AreEquivalent(new[] { _booby, _boomslang }, startingWithBoo);

        Animal[] startingWithPan = _indexedSet.StartsWith(x => x.Name.AsMemory(), "Pan".AsMemory()).ToArray();
        CollectionAssert.AreEquivalent(new[] { _panther, _pangolin }, startingWithPan);
    }

    [TestMethod]
    public void search_via_fuzzy_starts_with()
    {
        Animal[] roughlyBoo = _indexedSet.FuzzyStartsWith(x => x.Name.AsMemory(), "Boo".AsMemory(), 1).ToArray();
        CollectionAssert.AreEquivalent(new[] { _bonobo, _booby, _boomslang, _borador }, roughlyBoo);

        Animal[] roughlyPan = _indexedSet.FuzzyStartsWith(x => x.Name.AsMemory(), "Pan".AsMemory(), 1).ToArray();
        CollectionAssert.AreEquivalent(new[] { _penguin, _parrot, _panther, _pangolin }, roughlyPan);
    }

    [TestMethod]
    public void search_via_contains()
    {
        Animal[] containingAn = _indexedSet.Contains(x => x.Name.AsMemory(), "an".AsMemory()).ToArray();

        CollectionAssert.AreEquivalent(new[] { _boomslang, _tarantula, _panther, _pangolin }, containingAn);
    }

    [TestMethod]
    public void search_via_fuzzy_contains()
    {
        Animal[] expected = { _boomslang, _tarantula, _penguin, _parrot, _panther, _pangolin };

        Animal[] roughlyContainingPan = _indexedSet.FuzzyContains(x => x.Name.AsMemory(), "Pan".AsMemory(), 1).ToArray();

        CollectionAssert.AreEquivalent(expected, roughlyContainingPan);
    }
}
Loading

0 comments on commit 5c2319f

Please sign in to comment.