diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index ba37f65..395c462 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -42,7 +42,7 @@ jobs: # Initializes the CodeQL tools for scanning. - name: Initialize CodeQL - uses: github/codeql-action/init@v1 + uses: github/codeql-action/init@v2 with: languages: ${{ matrix.language }} # If you wish to specify custom queries, you can do so here or in a config file. @@ -59,4 +59,4 @@ jobs: run: dotnet build - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v1 + uses: github/codeql-action/analyze@v2 diff --git a/Akade.IndexedSet.Tests/Akade.IndexedSet.Tests.csproj b/Akade.IndexedSet.Tests/Akade.IndexedSet.Tests.csproj index 2274058..549e36c 100644 --- a/Akade.IndexedSet.Tests/Akade.IndexedSet.Tests.csproj +++ b/Akade.IndexedSet.Tests/Akade.IndexedSet.Tests.csproj @@ -8,9 +8,9 @@ - - - + + + all runtime; build; native; contentfiles; analyzers; buildtransitive diff --git a/Akade.IndexedSet.Tests/DataStructures/SuffixTrieTests.cs b/Akade.IndexedSet.Tests/DataStructures/SuffixTrieTests.cs new file mode 100644 index 0000000..e4fd6b4 --- /dev/null +++ b/Akade.IndexedSet.Tests/DataStructures/SuffixTrieTests.cs @@ -0,0 +1,131 @@ +using Akade.IndexedSet.DataStructures; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Akade.IndexedSet.Tests.DataStructures; + +[TestClass] +public class SuffixTrieTests +{ + private SuffixTrie _trie = default!; + + [TestInitialize] + public void TestInitializer() + { + _trie = new(); + _ = AddToStringTrie(_trie, "Tiger"); + _ = AddToStringTrie(_trie, "Tarantula"); + _ = AddToStringTrie(_trie, "Penguin"); + _ = AddToStringTrie(_trie, "Panther"); + _ = AddToStringTrie(_trie, "Pangolin"); + _ = AddToStringTrie(_trie, "Parrot"); + _ = AddToStringTrie(_trie, "Chihuahua"); + } + + [TestMethod] + public void querying_common_prefixes_return_correct_elements() + { + 
CollectionAssert.AreEquivalent(new string[] { "Tiger", "Tarantula" }, _trie.GetAll("T").ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Penguin", "Panther", "Pangolin", "Parrot" }, _trie.GetAll("P").ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Panther", "Pangolin", "Parrot" }, _trie.GetAll("Pa").ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Panther", "Pangolin" }, _trie.GetAll("Pan").ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Panther" }, _trie.GetAll("Pant").ToArray()); + } + + [TestMethod] + public void querying_common_infixes_return_correct_elements() + { + CollectionAssert.AreEquivalent(new string[] { "Panther", "Pangolin", "Tarantula" }, _trie.GetAll("an").ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Chihuahua" }, _trie.GetAll("hua").ToArray()); + } + + [TestMethod] + public void querying_common_suffixes_return_correct_elements() + { + CollectionAssert.AreEquivalent(new string[] { "Panther", "Tiger" }, _trie.GetAll("er").ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Chihuahua" }, _trie.GetAll("hua").ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Penguin", "Pangolin" }, _trie.GetAll("in").ToArray()); + } + + [TestMethod] + public void adding_the_same_element_return_false() + { + SuffixTrie trie = new(); + + Assert.IsTrue(AddToStringTrie(trie, "Cat")); + Assert.IsFalse(AddToStringTrie(trie, "Cat")); + } + + [TestMethod] + public void contains_returns_correct_value_when_adding_elements() + { + SuffixTrie trie = new(); + + Assert.IsFalse(ContainsInStringTrie(trie, "Cat")); + Assert.IsTrue(AddToStringTrie(trie, "Cat")); + Assert.IsTrue(ContainsInStringTrie(trie, "Cat")); + } + + [TestMethod] + public void contains_returns_correct_value_when_removing_elements() + { + SuffixTrie trie = new(); + + _ = AddToStringTrie(trie, "Cat"); + Assert.IsTrue(ContainsInStringTrie(trie, "Cat")); + Assert.IsTrue(RemoveFromStringTrie(trie, "Cat")); + 
Assert.IsFalse(ContainsInStringTrie(trie, "Cat")); + } + + [TestMethod] + public void removing_returns_false_if_the_element_is_not_present() + { + SuffixTrie trie = new(); + Assert.IsFalse(RemoveFromStringTrie(trie, "Cat")); + } + + [TestMethod] + public void exact_fuzzy_search_with_single_result() + { + IEnumerable result = _trie.FuzzySearch("rantul", 1, true); + Assert.AreEqual("Tarantula", result.Single()); + } + + [TestMethod] + public void exact_fuzzy_search_without_results() + { + IEnumerable result = _trie.FuzzySearch("Panner", 1, true); + Assert.IsFalse(result.Any()); + } + + [TestMethod] + + public void inexact_fuzzy_search_and_multiple_result() + { + IEnumerable result = _trie.FuzzySearch("Pan", 2, false); + CollectionAssert.AreEquivalent(new[] { "Penguin", "Panther", "Pangolin", "Parrot", "Tarantula", "Chihuahua" }, result.ToArray()); + } + + [TestMethod] + public void inexact_fuzzy_search_without_result() + { + IEnumerable result = _trie.FuzzySearch("Non", 1, false); + Assert.IsFalse(result.Any()); + } + + + // https://murilo.wordpress.com/2011/02/01/fast-and-easy-levenshtein-distance-using-a-trie-in-c/ + private static bool AddToStringTrie(SuffixTrie stringTrie, string value) + { + return stringTrie.Add(value, value); + } + + private static bool RemoveFromStringTrie(SuffixTrie stringTrie, string value) + { + return stringTrie.Remove(value, value); + } + + private static bool ContainsInStringTrie(SuffixTrie stringTrie, string value) + { + return stringTrie.Contains(value, value); + } +} diff --git a/Akade.IndexedSet.Tests/DataStructures/TrieTests.cs b/Akade.IndexedSet.Tests/DataStructures/TrieTests.cs new file mode 100644 index 0000000..97ada86 --- /dev/null +++ b/Akade.IndexedSet.Tests/DataStructures/TrieTests.cs @@ -0,0 +1,122 @@ +using Akade.IndexedSet.DataStructures; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Akade.IndexedSet.Tests.DataStructures; + +[TestClass] +public class TrieTests +{ + private static Trie GetAnimalTrie() 
+ { + Trie trie = new(); + + _ = AddToStringTrie(trie, "Tiger"); + _ = AddToStringTrie(trie, "Tarantula"); + _ = AddToStringTrie(trie, "Penguin"); + _ = AddToStringTrie(trie, "Panther"); + _ = AddToStringTrie(trie, "Pangolin"); + _ = AddToStringTrie(trie, "Parrot"); + return trie; + } + + [TestMethod] + public void querying_common_prefixes_return_correct_elements() + { + Trie trie = GetAnimalTrie(); + + CollectionAssert.AreEquivalent(new string[] { "Tiger", "Tarantula" }, trie.GetAll("T").ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Penguin", "Panther", "Pangolin", "Parrot" }, trie.GetAll("P").ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Panther", "Pangolin", "Parrot" }, trie.GetAll("Pa").ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Panther", "Pangolin" }, trie.GetAll("Pan").ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Panther" }, trie.GetAll("Pant").ToArray()); + } + + [TestMethod] + public void adding_the_same_element_returns_false() + { + Trie trie = new(); + + Assert.IsTrue(AddToStringTrie(trie, "Cat")); + Assert.IsFalse(AddToStringTrie(trie, "Cat")); + } + + [TestMethod] + public void contains_returns_correct_value_when_adding_elements() + { + Trie trie = new(); + + Assert.IsFalse(ContainsInStringTrie(trie, "Cat")); + Assert.IsTrue(AddToStringTrie(trie, "Cat")); + Assert.IsTrue(ContainsInStringTrie(trie, "Cat")); + } + + [TestMethod] + public void contains_returns_correct_value_when_removing_elements() + { + Trie trie = new(); + _ = AddToStringTrie(trie, "Cat"); + + Assert.IsTrue(ContainsInStringTrie(trie, "Cat")); + Assert.IsTrue(RemoveFromStringTrie(trie, "Cat")); + Assert.IsFalse(ContainsInStringTrie(trie, "Cat")); + } + + [TestMethod] + public void removing_returns_false_if_the_element_is_not_present() + { + Trie trie = new(); + Assert.IsFalse(RemoveFromStringTrie(trie, "Cat")); + } + + [TestMethod] + public void exact_fuzzy_search_with_single_result() + { + Trie trie = GetAnimalTrie(); + + 
IEnumerable result = trie.FuzzySearch("Panter", 1, true); + Assert.AreEqual("Panther", result.Single()); + } + + [TestMethod] + public void exact_fuzzy_search_without_results() + { + Trie trie = GetAnimalTrie(); + + IEnumerable result = trie.FuzzySearch("Panner", 1, true); + Assert.IsFalse(result.Any()); + } + + [TestMethod] + public void inexact_fuzzy_search_and_multiple_result() + { + Trie trie = GetAnimalTrie(); + + IEnumerable result = trie.FuzzySearch("Pan", 2, false); + + CollectionAssert.AreEquivalent(new[] { "Penguin", "Panther", "Pangolin", "Parrot" }, result.ToArray()); + } + + [TestMethod] + public void inexact_fuzzy_search_without_result() + { + Trie trie = GetAnimalTrie(); + + IEnumerable result = trie.FuzzySearch("Non", 1, false); + Assert.IsFalse(result.Any()); + } + + private static bool AddToStringTrie(Trie stringTrie, string value) + { + return stringTrie.Add(value, value); + } + + private static bool RemoveFromStringTrie(Trie stringTrie, string value) + { + return stringTrie.Remove(value, value); + } + + private static bool ContainsInStringTrie(Trie stringTrie, string value) + { + return stringTrie.Contains(value, value); + } +} diff --git a/Akade.IndexedSet.Tests/FullTextIndices.cs b/Akade.IndexedSet.Tests/FullTextIndices.cs new file mode 100644 index 0000000..39ed44d --- /dev/null +++ b/Akade.IndexedSet.Tests/FullTextIndices.cs @@ -0,0 +1,96 @@ +using Akade.IndexedSet.Tests.TestUtilities; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Akade.IndexedSet.Tests; + +[TestClass] +public class FullTextIndices +{ + private record class Animal(string Name, string Category); + + private IndexedSet _indexedSet = null!; + private readonly Animal _bonobo = new("Bonobo", "Mammal"); + private readonly Animal _booby = new("Booby", "Bird"); + private readonly Animal _boomslang = new("Boomslang", "Reptile"); + private readonly Animal _borador = new("Borador", "Mammal"); + private readonly Animal _tiger = new("Tiger", "Mammal"); + private 
readonly Animal _tarantula = new("Tarantula", "Spider"); + private readonly Animal _tapir = new("Tapir", "Mammal"); + private readonly Animal _penguin = new("Penguin", "Bird"); + private readonly Animal _panther = new("Panther", "Mammal"); + private readonly Animal _pangolin = new("Pangolin", "Mammal"); + private readonly Animal _parrot = new("Parrot", "Bird"); + + + [TestInitialize] + public void Init() + { + var data = new Animal[] { + _bonobo, + _booby, + _boomslang, + _borador, + _tiger, + _tarantula, + _tapir, + _penguin, + _panther, + _pangolin, + _parrot, + }; + _indexedSet = data.ToIndexedSet() + .WithFullTextIndex(x => x.Category.AsMemory()) + .WithFullTextIndex(x => x.Name.AsMemory()) + .Build(); + } + + [TestMethod] + public void single_item_retrieval_works() + { + _indexedSet.AssertSingleItem(x => x.Category.AsMemory(), _boomslang); + _indexedSet.AssertSingleItem(x => x.Category.AsMemory(), _tarantula); + } + + [TestMethod] + [ExpectedException(typeof(InvalidOperationException))] + public void single_item_retrieval_throws_exception_if_there_is_more_than_one_result() + { + _indexedSet.AssertSingleItem(x => x.Category.AsMemory(), _bonobo); + } + + [TestMethod] + public void multi_item_retrieval_works() + { + _indexedSet.AssertMultipleItems(x => x.Category.AsMemory(), expectedElements: new[] { _bonobo, _borador, _tiger, _tapir, _panther, _pangolin }); + _indexedSet.AssertMultipleItems(x => x.Category.AsMemory(), expectedElements: new[] { _booby, _penguin, _parrot }); + } + + [TestMethod] + public void search_via_starts_with() + { + CollectionAssert.AreEquivalent(new[] { _booby, _boomslang }, _indexedSet.StartsWith(x => x.Name.AsMemory(), "Boo".AsMemory()).ToArray()); + CollectionAssert.AreEquivalent(new[] { _panther, _pangolin }, _indexedSet.StartsWith(x => x.Name.AsMemory(), "Pan".AsMemory()).ToArray()); + } + + + [TestMethod] + public void search_via_fuzzy_starts_with() + { + CollectionAssert.AreEquivalent(new[] { _bonobo, _booby, _boomslang, _borador }, 
_indexedSet.FuzzyStartsWith(x => x.Name.AsMemory(), "Boo".AsMemory(), 1).ToArray()); + CollectionAssert.AreEquivalent(new[] { _penguin, _parrot, _panther, _pangolin }, _indexedSet.FuzzyStartsWith(x => x.Name.AsMemory(), "Pan".AsMemory(), 1).ToArray()); + } + + [TestMethod] + public void search_via_contains() + { + CollectionAssert.AreEquivalent(new[] { _boomslang, _tarantula, _panther, _pangolin }, _indexedSet.Contains(x => x.Name.AsMemory(), "an".AsMemory()).ToArray()); + } + + + [TestMethod] + public void search_via_fuzzy_contains() + { + Animal[] actual = _indexedSet.FuzzyContains(x => x.Name.AsMemory(), "Pan".AsMemory(), 1).ToArray(); + CollectionAssert.AreEquivalent(new[] { _boomslang, _tarantula, _penguin, _parrot, _panther, _pangolin }, actual); + } +} diff --git a/Akade.IndexedSet.Tests/NonUniqueIndices.cs b/Akade.IndexedSet.Tests/NonUniqueIndices.cs index ae112fe..d2cdf4d 100644 --- a/Akade.IndexedSet.Tests/NonUniqueIndices.cs +++ b/Akade.IndexedSet.Tests/NonUniqueIndices.cs @@ -1,5 +1,5 @@ using Akade.IndexedSet.Tests.Data; -using Akade.IndexedSet.Tests.Utilities; +using Akade.IndexedSet.Tests.TestUtilities; using Microsoft.VisualStudio.TestTools.UnitTesting; namespace Akade.IndexedSet.Tests; @@ -8,16 +8,16 @@ namespace Akade.IndexedSet.Tests; public class NonUniqueIndices { private IndexedSet _indexedSet = null!; - private readonly TestData _a = new(0, 10, GuidGen.Get(1), "A"); - private readonly TestData _b = new(1, 10, GuidGen.Get(2), "B"); - private readonly TestData _c = new(2, 11, GuidGen.Get(3), "C"); - private readonly TestData _d = new(3, 12, GuidGen.Get(4), "C"); - private readonly TestData _e = new(4, 12, GuidGen.Get(4), "C"); + private readonly TestData _a = new(0, 10, GuidGen.Get(1), "AA"); + private readonly TestData _b = new(1, 10, GuidGen.Get(2), "BB"); + private readonly TestData _c = new(2, 11, GuidGen.Get(3), "CC"); + private readonly TestData _d = new(3, 12, GuidGen.Get(4), "CC"); + private readonly TestData _e = new(4, 12, 
GuidGen.Get(4), "CC"); [TestInitialize] public void Init() { - TestData[] data = new[] {_a, _b, _c, _d, _e}; + TestData[] data = new[] { _a, _b, _c, _d, _e }; _indexedSet = data.ToIndexedSet(x => x.PrimaryKey) .WithIndex(x => x.IntProperty) .WithIndex(x => x.GuidProperty) @@ -60,7 +60,7 @@ public void range_queries_throw_exception() [TestMethod] public void retrieval_via_compound_key_returns_correct_items() { - TestData[] data = new[] {_a, _b, _c, _d, _e}; + TestData[] data = new[] { _a, _b, _c, _d, _e }; _indexedSet = data.ToIndexedSet(x => x.PrimaryKey) .WithIndex(x => (x.IntProperty, x.StringProperty)) .Build(); @@ -78,4 +78,11 @@ public void Removal() Assert.IsFalse(_indexedSet.Remove(0)); Assert.IsFalse(_indexedSet.Contains(0)); } + + [TestMethod] + public void string_query_selects_the_correct_where_overload() + { + _indexedSet.AssertMultipleItems(x => x.StringProperty, expectedElements: new[] { _c, _d, _e }); + Assert.AreEqual(_a, _indexedSet.Where(x => x.StringProperty, "AA").Single()); + } } diff --git a/Akade.IndexedSet.Tests/PrefixIndices.cs b/Akade.IndexedSet.Tests/PrefixIndices.cs new file mode 100644 index 0000000..b18e488 --- /dev/null +++ b/Akade.IndexedSet.Tests/PrefixIndices.cs @@ -0,0 +1,82 @@ +using Akade.IndexedSet.Tests.TestUtilities; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Akade.IndexedSet.Tests; + +[TestClass] +public class PrefixIndices +{ + private record class Animal(string Name, string Category); + + private IndexedSet _indexedSet = null!; + private readonly Animal _bonobo = new("Bonobo", "Mammal"); + private readonly Animal _booby = new("Booby", "Bird"); + private readonly Animal _boomslang = new("Boomslang", "Reptile"); + private readonly Animal _borador = new("Borador", "Mammal"); + private readonly Animal _tiger = new("Tiger", "Mammal"); + private readonly Animal _tarantula = new("Tarantula", "Spider"); + private readonly Animal _tapir = new("Tapir", "Mammal"); + private readonly Animal _penguin = 
new("Penguin", "Bird"); + private readonly Animal _panther = new("Panther", "Mammal"); + private readonly Animal _pangolin = new("Pangolin", "Mammal"); + private readonly Animal _parrot = new("Parrot", "Bird"); + + + [TestInitialize] + public void Init() + { + var data = new Animal[] { + _bonobo, + _booby, + _boomslang, + _borador, + _tiger, + _tarantula, + _tapir, + _penguin, + _panther, + _pangolin, + _parrot, + }; + _indexedSet = data.ToIndexedSet() + .WithPrefixIndex(x => x.Category.AsMemory()) + .WithPrefixIndex(x => x.Name.AsMemory()) + .Build(); + } + + [TestMethod] + public void single_item_retrieval_works() + { + _indexedSet.AssertSingleItem(x => x.Category.AsMemory(), _boomslang); + _indexedSet.AssertSingleItem(x => x.Category.AsMemory(), _tarantula); + } + + [TestMethod] + [ExpectedException(typeof(InvalidOperationException))] + public void single_item_retrieval_throws_exception_if_there_is_more_than_one_result() + { + _indexedSet.AssertSingleItem(x => x.Category.AsMemory(), _bonobo); + } + + [TestMethod] + public void multi_item_retrieval_works() + { + _indexedSet.AssertMultipleItems(x => x.Category.AsMemory(), expectedElements: new[] { _bonobo, _borador, _tiger, _tapir, _panther, _pangolin }); + _indexedSet.AssertMultipleItems(x => x.Category.AsMemory(), expectedElements: new[] { _booby, _penguin, _parrot }); + } + + [TestMethod] + public void search_via_starts_with() + { + CollectionAssert.AreEquivalent(new[] { _booby, _boomslang }, _indexedSet.StartsWith(x => x.Name.AsMemory(), "Boo".AsMemory()).ToArray()); + CollectionAssert.AreEquivalent(new[] { _panther, _pangolin }, _indexedSet.StartsWith(x => x.Name.AsMemory(), "Pan".AsMemory()).ToArray()); + } + + + [TestMethod] + public void search_via_fuzzy_starts_with() + { + CollectionAssert.AreEquivalent(new[] { _bonobo, _booby, _boomslang, _borador }, _indexedSet.FuzzyStartsWith(x => x.Name.AsMemory(), "Boo".AsMemory(), 1).ToArray()); + CollectionAssert.AreEquivalent(new[] { _penguin, _parrot, _panther, 
_pangolin }, _indexedSet.FuzzyStartsWith(x => x.Name.AsMemory(), "Pan".AsMemory(), 1).ToArray()); + } +} diff --git a/Akade.IndexedSet.Tests/RangeIndices.cs b/Akade.IndexedSet.Tests/RangeIndices.cs index 932d76c..f0f650f 100644 --- a/Akade.IndexedSet.Tests/RangeIndices.cs +++ b/Akade.IndexedSet.Tests/RangeIndices.cs @@ -1,5 +1,5 @@ using Akade.IndexedSet.Tests.Data; -using Akade.IndexedSet.Tests.Utilities; +using Akade.IndexedSet.Tests.TestUtilities; using Microsoft.VisualStudio.TestTools.UnitTesting; namespace Akade.IndexedSet.Tests; diff --git a/Akade.IndexedSet.Tests/Samples/Appointments/AppointmentSample.cs b/Akade.IndexedSet.Tests/Samples/Appointments/AppointmentSample.cs index 47b88fc..d42f33e 100644 --- a/Akade.IndexedSet.Tests/Samples/Appointments/AppointmentSample.cs +++ b/Akade.IndexedSet.Tests/Samples/Appointments/AppointmentSample.cs @@ -1,4 +1,4 @@ -using Akade.IndexedSet.Tests.Utilities; +using Akade.IndexedSet.Tests.TestUtilities; using Microsoft.VisualStudio.TestTools.UnitTesting; namespace Akade.IndexedSet.Tests.Samples.Appointments; @@ -19,6 +19,7 @@ private static TimeSpan Duration(Appointment appointment) .WithRangeIndex(x => x.Start) .WithRangeIndex(x => x.End) .WithRangeIndex(Duration) // calculated property + .WithFullTextIndex(x => x.Subject.AsMemory()) .Build(); public AppointmentSample() @@ -108,4 +109,22 @@ public void using_a_calculated_index_property_to_find_all_longer_appointments() CollectionAssert.AreEqual(new[] { "Iteration Planning - 420", "Weekly - 75", "Discuss Issue #1234 - 45", "Discuss Technical Debt #42 - 45" }, longerAppointments); } + + [TestMethod] + public void text_searching_within_subjects() + { + // querying within the full-text-search index allows to perform a contains over a trie instead of comparing it on all elements + Appointment meetingWith42InSubject = _appointments.Contains(x => x.Subject.AsMemory(), "#42".AsMemory()).Single(); + Assert.IsTrue(meetingWith42InSubject.Subject.Contains("#42")); + } + + 
[TestMethod] + public void fuzzy_searching_within_subjects() + { + // Fulltext and prefix indices support fuzzy matching to allow a certain number of errors (Levenshtein/edit distance) + Appointment technicalDebtMeeting = _appointments.FuzzyContains(x => x.Subject.AsMemory(), "Technical Det".AsMemory(), 1).Single(); + + Assert.IsFalse(technicalDebtMeeting.Subject.Contains("Technical Det")); + Assert.IsTrue(technicalDebtMeeting.Subject.Contains("Technical Debt")); + } } diff --git a/Akade.IndexedSet.Tests/Samples/Leaderboard/LeaderboardSample.cs b/Akade.IndexedSet.Tests/Samples/Leaderboard/LeaderboardSample.cs index 13f7856..7bd4579 100644 --- a/Akade.IndexedSet.Tests/Samples/Leaderboard/LeaderboardSample.cs +++ b/Akade.IndexedSet.Tests/Samples/Leaderboard/LeaderboardSample.cs @@ -5,7 +5,6 @@ namespace Akade.IndexedSet.Tests.Samples.Leaderboard; [TestClass] public class LeaderboardSample { - private readonly IndexedSet _leaderboard = IndexedSetBuilder.Create(x => x.Id) .WithRangeIndex(x => x.Score) .WithRangeIndex(x => x.Timestamp) diff --git a/Akade.IndexedSet.Tests/Samples/Readme.cs b/Akade.IndexedSet.Tests/Samples/Readme.cs index a80e5fa..873844b 100644 --- a/Akade.IndexedSet.Tests/Samples/Readme.cs +++ b/Akade.IndexedSet.Tests/Samples/Readme.cs @@ -103,6 +103,24 @@ public void Features_ComputedOrCompoundKey() result = set.Where(ComputedKey.SomeStaticMethod, 42); } + [TestMethod] + public void Features_StringQueries() + { + IndexedSet data = typeof(object).Assembly.GetTypes() + .ToIndexedSet() + .WithPrefixIndex(x => x.Name.AsMemory()) + .WithFullTextIndex(x => x.FullName.AsMemory()) + .Build(); + + // fast prefix or contains queries via indices + _ = data.StartsWith(x => x.Name.AsMemory(), "Int".AsMemory()); + _ = data.Contains(x => x.FullName.AsMemory(), "Int".AsMemory()); + + // fuzzy searching is supported by prefix and full text indices + // the following will also match "String" + _ = data.FuzzyStartsWith(x => x.Name.AsMemory(), "Strang".AsMemory(), 1); + _ 
= data.FuzzyContains(x => x.FullName.AsMemory(), "Strang".AsMemory(), 1); + } [TestMethod] public void FAQ_MultipleIndicesForSameProperty() @@ -146,6 +164,9 @@ public void Overview() _ = set.Where(x => (x.ProductId, x.UnitPrice), (4, 10)); } + + + private static class DataIndices { public static int UniqueIndex(Data x) diff --git a/Akade.IndexedSet.Tests/Samples/TypeaheadSample/TypeaheadSample.cs b/Akade.IndexedSet.Tests/Samples/TypeaheadSample/TypeaheadSample.cs new file mode 100644 index 0000000..9e83023 --- /dev/null +++ b/Akade.IndexedSet.Tests/Samples/TypeaheadSample/TypeaheadSample.cs @@ -0,0 +1,27 @@ +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Akade.IndexedSet.Tests.Samples.TypeaheadSample; + +[TestClass] +public class TypeaheadSample +{ + private readonly IndexedSet _types; + + public TypeaheadSample() + { + _types = typeof(string).Assembly.GetTypes() + .ToIndexedSet() + .WithPrefixIndex(x => x.Name.ToLowerInvariant().AsMemory()) + .Build(); + } + + [TestMethod] + public void Case_insensitve_lookahead_in_all_types_within_system_runtime() + { + // Travers the prefix trie to efficiently find all matches + Type[] types = _types.StartsWith(x => x.Name.ToLowerInvariant().AsMemory(), "int".AsMemory()).ToArray(); + + Assert.IsTrue(types.Any()); + Assert.IsTrue(types.All(t => t.Name.StartsWith("int", StringComparison.InvariantCultureIgnoreCase))); + } +} diff --git a/Akade.IndexedSet.Tests/Utilities/GuidGen.cs b/Akade.IndexedSet.Tests/TestUtilities/GuidGen.cs similarity index 72% rename from Akade.IndexedSet.Tests/Utilities/GuidGen.cs rename to Akade.IndexedSet.Tests/TestUtilities/GuidGen.cs index f53d10a..f0d9ee3 100644 --- a/Akade.IndexedSet.Tests/Utilities/GuidGen.cs +++ b/Akade.IndexedSet.Tests/TestUtilities/GuidGen.cs @@ -1,4 +1,4 @@ -namespace Akade.IndexedSet.Tests.Utilities; +namespace Akade.IndexedSet.Tests.TestUtilities; internal class GuidGen { diff --git a/Akade.IndexedSet.Tests/Utilities/IEnumerableExtensions.cs 
b/Akade.IndexedSet.Tests/TestUtilities/IEnumerableExtensions.cs similarity index 87% rename from Akade.IndexedSet.Tests/Utilities/IEnumerableExtensions.cs rename to Akade.IndexedSet.Tests/TestUtilities/IEnumerableExtensions.cs index 3d12d82..9352af5 100644 --- a/Akade.IndexedSet.Tests/Utilities/IEnumerableExtensions.cs +++ b/Akade.IndexedSet.Tests/TestUtilities/IEnumerableExtensions.cs @@ -1,4 +1,4 @@ -namespace Akade.IndexedSet.Tests.Utilities; +namespace Akade.IndexedSet.Tests.TestUtilities; internal static class IEnumerableExtensions { diff --git a/Akade.IndexedSet.Tests/Utilities/IndexAssert.cs b/Akade.IndexedSet.Tests/TestUtilities/IndexAssert.cs similarity index 71% rename from Akade.IndexedSet.Tests/Utilities/IndexAssert.cs rename to Akade.IndexedSet.Tests/TestUtilities/IndexAssert.cs index 1556609..9f6747b 100644 --- a/Akade.IndexedSet.Tests/Utilities/IndexAssert.cs +++ b/Akade.IndexedSet.Tests/TestUtilities/IndexAssert.cs @@ -1,12 +1,11 @@ using Microsoft.VisualStudio.TestTools.UnitTesting; using System.Runtime.CompilerServices; -namespace Akade.IndexedSet.Tests.Utilities; +namespace Akade.IndexedSet.Tests.TestUtilities; internal static class IndexAssert { - public static void AssertSingleItem(this IndexedSet indexedSet, Func indexAccessor, TElement testData, [CallerArgumentExpression("indexAccessor")] string? indexName = null) - where TPrimaryKey : notnull + public static void AssertSingleItem(this IndexedSet indexedSet, Func indexAccessor, TElement testData, [CallerArgumentExpression("indexAccessor")] string? indexName = null) where TIndexKey : notnull { Assert.IsNotNull(indexName); @@ -19,8 +18,7 @@ public static void AssertSingleItem(this Index Assert.AreEqual(testData, accessViaWhere); } - public static void AssertMultipleItems(this IndexedSet indexedSet, Func indexAccessor, [CallerArgumentExpression("indexAccessor")] string? 
indexName = null, bool requireOrder = false, params TElement[] expectedElements) - where TPrimaryKey : notnull + public static void AssertMultipleItems(this IndexedSet indexedSet, Func indexAccessor, [CallerArgumentExpression("indexAccessor")] string? indexName = null, bool requireOrder = false, params TElement[] expectedElements) where TIndexKey : notnull { if (expectedElements.Length < 2) @@ -53,8 +51,8 @@ public static void AssertMultipleItems(this In } } - public static void AssertMultipleItemsViaRange - (this IndexedSet indexedSet, + public static void AssertMultipleItemsViaRange + (this IndexedSet indexedSet, Func indexAccessor, TIndexKey start, TIndexKey end, @@ -62,7 +60,6 @@ public static void AssertMultipleItemsViaRange bool inclusiveEnd, [CallerArgumentExpression("indexAccessor")] string? indexName = null, params TElement[] expectedElements) - where TPrimaryKey : notnull where TIndexKey : notnull { Assert.IsNotNull(indexName); diff --git a/Akade.IndexedSet.Tests/UniqueIndices.cs b/Akade.IndexedSet.Tests/UniqueIndices.cs index f177616..410512f 100644 --- a/Akade.IndexedSet.Tests/UniqueIndices.cs +++ b/Akade.IndexedSet.Tests/UniqueIndices.cs @@ -1,5 +1,5 @@ using Akade.IndexedSet.Tests.Data; -using Akade.IndexedSet.Tests.Utilities; +using Akade.IndexedSet.Tests.TestUtilities; using Microsoft.VisualStudio.TestTools.UnitTesting; namespace Akade.IndexedSet.Tests; diff --git a/Akade.IndexedSet.Tests/Utils/LevensteinDistanceTests.cs b/Akade.IndexedSet.Tests/Utils/LevensteinDistanceTests.cs new file mode 100644 index 0000000..df246f7 --- /dev/null +++ b/Akade.IndexedSet.Tests/Utils/LevensteinDistanceTests.cs @@ -0,0 +1,39 @@ +using Akade.IndexedSet.Utils; +using Microsoft.VisualStudio.TestTools.UnitTesting; + +namespace Akade.IndexedSet.Tests.Utils; + +[TestClass] +public class LevensteinDistanceTests +{ + [TestMethod] + public void distance_zero_should_perform_normal_match() + { + Assert.IsTrue(LevenshteinDistance.FuzzyMatch("Test", "Test", 0)); + 
Assert.IsFalse(LevenshteinDistance.FuzzyMatch("Test", "Best", 0)); + } + + [TestMethod] + public void distance_one_should_only_match_a_single_change_or_deletion() + { + Assert.IsTrue(LevenshteinDistance.FuzzyMatch("Test", "Test", 0)); + + Assert.IsTrue(LevenshteinDistance.FuzzyMatch("Test", "Best", 1)); + Assert.IsTrue(LevenshteinDistance.FuzzyMatch("Test", "est", 1)); + + Assert.IsFalse(LevenshteinDistance.FuzzyMatch("Test", "st", 1)); + Assert.IsFalse(LevenshteinDistance.FuzzyMatch("Test", "Bast", 1)); + } + + [TestMethod] + public void distance_two_should_match_two_changes_or_deletions() + { + Assert.IsTrue(LevenshteinDistance.FuzzyMatch("Test", "Test", 0)); + + Assert.IsTrue(LevenshteinDistance.FuzzyMatch("Test", "Best", 2)); + Assert.IsTrue(LevenshteinDistance.FuzzyMatch("Test", "est", 2)); + + Assert.IsTrue(LevenshteinDistance.FuzzyMatch("Test", "st", 2)); + Assert.IsTrue(LevenshteinDistance.FuzzyMatch("Test", "Bast", 2)); + } +} diff --git a/Akade.IndexedSet/Akade.IndexedSet.csproj b/Akade.IndexedSet/Akade.IndexedSet.csproj index 6e2c71f..f0f9f52 100644 --- a/Akade.IndexedSet/Akade.IndexedSet.csproj +++ b/Akade.IndexedSet/Akade.IndexedSet.csproj @@ -1,12 +1,11 @@  - net6.0 enable enable True True - 0.3.0 + 0.4.0 beta Provides an In-Memory data structure, the IndexedSet, that allows to easily add indices to allow efficient querying. Currently supports unique and non-unique indices as well as range indices for single attribute, compound or computed keys. 
Copyright © Akade 2022 @@ -14,11 +13,11 @@ true https://github.com/akade/Akade.IndexedSet MIT - data structure;indices;range query;unique index;range index;non-unique index;multi-value;dictionary;collections;computed keys + data structure;indices;range query;unique index;range index;non-unique index;multi-value;dictionary;collections;computed keys;fuzzy;fuzzy search;approximate string matching;search /README.md - + <_Parameter1>Akade.IndexedSet.Tests diff --git a/Akade.IndexedSet/DataStructures/SuffixTrie.cs b/Akade.IndexedSet/DataStructures/SuffixTrie.cs new file mode 100644 index 0000000..2a2ab6c --- /dev/null +++ b/Akade.IndexedSet/DataStructures/SuffixTrie.cs @@ -0,0 +1,56 @@ +namespace Akade.IndexedSet.DataStructures; +internal class SuffixTrie +{ + private readonly Trie _trie = new(); + + public bool Add(ReadOnlySpan key, TElement element) + { + if (_trie.Contains(key, element)) + { + return false; + } + + for (int i = 0; i < key.Length; i++) + { + _ = _trie.Add(key[i..key.Length], element); + } + + return true; + } + + public bool Contains(ReadOnlySpan key, TElement element) + { + return _trie.Contains(key, element); + } + + public bool Remove(ReadOnlySpan key, TElement element) + { + if (!_trie.Contains(key, element)) + { + return false; + } + + for (int i = 0; i < key.Length; i++) + { + _ = _trie.Remove(key[i..key.Length], element); + } + + return true; + } + + public IEnumerable Get(ReadOnlySpan key) + { + return _trie.Get(key); + } + + + public IEnumerable GetAll(ReadOnlySpan key) + { + return _trie.GetAll(key).Distinct(); + } + + internal IEnumerable FuzzySearch(ReadOnlySpan key, int maxDistance, bool exactMatches) + { + return _trie.FuzzySearch(key, maxDistance, exactMatches).Distinct(); + } +} diff --git a/Akade.IndexedSet/DataStructures/Trie.cs b/Akade.IndexedSet/DataStructures/Trie.cs new file mode 100644 index 0000000..6ae4c65 --- /dev/null +++ b/Akade.IndexedSet/DataStructures/Trie.cs @@ -0,0 +1,222 @@ +using Akade.IndexedSet.Extensions; +using 
System.Diagnostics.CodeAnalysis; + +namespace Akade.IndexedSet.DataStructures; + +internal class Trie +{ + private readonly TrieNode _root = new(); + + public bool Add(ReadOnlySpan key, TElement element) + { + return _root.Add(key, element); + } + + public bool Remove(ReadOnlySpan key, TElement element) + { + return _root.Remove(key, element); + } + + public IEnumerable Get(ReadOnlySpan key) + { + TrieNode? matchingNode = _root.Find(key); + return matchingNode is null ? Enumerable.Empty() : matchingNode.GetLocalElements(); + } + + public bool Contains(ReadOnlySpan key, TElement element) + { + return _root.Contains(key, element); + } + + + public IEnumerable GetAll(ReadOnlySpan prefix) + { + TrieNode? matchingNode = _root.Find(prefix); + + if (matchingNode is null) + { + return Enumerable.Empty(); + } + + IEnumerable result = matchingNode.GetLocalElements(); + + foreach (TrieNode node in matchingNode.GetAllChildren()) + { + result = result.Concat(node.GetLocalElements()); + } + + return result; + } + + + public IEnumerable FuzzySearch(ReadOnlySpan word, int maxDistance, bool exactMatches) + { + int wordLength = word.Length; + + int[] currentRow = new int[wordLength + 1]; + + for (int i = 0; i < currentRow.Length; i++) + { + currentRow[i] = i; + } + + PriorityQueue results = new(); + + + for (int i = 0; i < wordLength; i++) + { + if (_root.TryGetChild(word[i], out TrieNode? startNode)) + { + FuzzySearchInternal(startNode, word[i], currentRow, word, results, maxDistance, exactMatches); + } + } + + return exactMatches + ? 
results.DequeueAsIEnumerable().SelectMany(node => node.GetLocalElements()) + : results.DequeueAsIEnumerable().SelectMany(node => node.GetAllChildren().SelectMany(n => n.GetLocalElements())); + } + + private void FuzzySearchInternal(TrieNode currentNode, char ch, int[] lastRow, ReadOnlySpan word, PriorityQueue results, int maxDistance, bool exploreSubTrees) + { + int[] currentRow = new int[lastRow.Length]; + currentRow[0] = lastRow[0] + 1; + + int minDistance = currentRow[0]; + + for (int i = 1; i < currentRow.Length; i++) + { + int insertOrDeletion = Math.Min(currentRow[i - 1] + 1, lastRow[i] + 1); + int replacement = word[i - 1] == ch ? lastRow[i - 1] : lastRow[i - 1] + 1; + currentRow[i] = Math.Min(insertOrDeletion, replacement); + minDistance = Math.Min(minDistance, currentRow[i]); + } + + bool found = false; + + if (currentRow[^1] <= maxDistance) + { + results.Enqueue(currentNode, currentRow[^1]); + found = true; + } + + if (!exploreSubTrees && found) + { + return; + } + + if (minDistance <= maxDistance) + { + foreach (KeyValuePair child in currentNode.GetLocalChildren()) + { + FuzzySearchInternal(child.Value, child.Key, currentRow, word, results, maxDistance, exploreSubTrees); + } + } + } + + private class TrieNode + { + private SortedDictionary? _children; + private HashSet? _elements; + + internal bool Add(ReadOnlySpan key, TElement element) + { + if (key.IsEmpty) + { + _elements ??= new HashSet(); + return _elements.Add(element); + } + else + { + _children ??= new(); + if (!_children.TryGetValue(key[0], out TrieNode? trieNode)) + { + _children[key[0]] = trieNode = new(); + } + + return trieNode.Add(key[1..], element); + } + } + + internal TrieNode? Find(ReadOnlySpan key) + { + if (key.IsEmpty) + { + return this; + } + else if (_children?.TryGetValue(key[0], out TrieNode? trieNode) ?? 
false) + { + return trieNode.Find(key[1..]); + } + + return null; + } + + internal IEnumerable GetAllChildren() + { + if (_children is null) + { + yield break; + } + + foreach (TrieNode node in _children.Values) + { + yield return node; + + foreach (TrieNode child in node.GetAllChildren()) + { + yield return child; + } + } + } + + internal IEnumerable> GetLocalChildren() + { + return _children ?? Enumerable.Empty>(); + } + + + internal IEnumerable GetLocalElements() + { + return _elements ?? Enumerable.Empty(); + } + + internal bool Remove(ReadOnlySpan key, TElement element) + { + if (key.IsEmpty) + { + return _elements?.Remove(element) ?? false; + } + else if (_children?.TryGetValue(key[0], out TrieNode? trieNode) ?? false) + { + return trieNode.Remove(key[1..], element); + } + + return false; + } + + internal bool Contains(ReadOnlySpan key, TElement element) + { + if (key.IsEmpty) + { + return _elements is not null && _elements.Contains(element); + } + else if (_children?.TryGetValue(key[0], out TrieNode? trieNode) ?? false) + { + return trieNode.Contains(key[1..], element); + } + + return false; + } + + internal bool TryGetChild(char key, [NotNullWhen(true)] out TrieNode? node) + { + node = default; + return _children?.TryGetValue(key, out node) ?? false; + } + + internal bool HasElements() + { + return _elements is not null && _elements.Count > 0; + } + } +} diff --git a/Akade.IndexedSet/Extensions/IEnumerableExtensions.cs b/Akade.IndexedSet/Extensions/IEnumerableExtensions.cs new file mode 100644 index 0000000..5ec0751 --- /dev/null +++ b/Akade.IndexedSet/Extensions/IEnumerableExtensions.cs @@ -0,0 +1,11 @@ +namespace Akade.IndexedSet.Extensions; +internal static class IEnumerableExtensions +{ + public static IEnumerable DequeueAsIEnumerable(this PriorityQueue queue) + { + while (queue.TryDequeue(out TElement? 
element, out _)) + { + yield return element; + } + } +} diff --git a/Akade.IndexedSet/IndexedSet.cs b/Akade.IndexedSet/IndexedSet.cs index 20d50f9..bdea380 100644 --- a/Akade.IndexedSet/IndexedSet.cs +++ b/Akade.IndexedSet/IndexedSet.cs @@ -382,6 +382,64 @@ public IEnumerable OrderByDescending(Func + /// Returns all elements that start with the given char sequence + /// + /// Accessor for the indexed property. The expression as a string is used as an identifier for the index. Hence, the convention is to always use x as an identifier. + /// Is passed to using . + /// The prefix to use + /// The name of the index. Usually, you should not specify this as the expression in is automatically passed by the compiler. + [SuppressMessage("Style", "IDE0060:Remove unused parameter", Justification = "Used as caller argument expression")] + public IEnumerable StartsWith(Func> indexAccessor, ReadOnlyMemory prefix, [CallerArgumentExpression("indexAccessor")] string? indexName = null) + { + TypedIndex> typedIndex = GetIndex>(indexName); + return typedIndex.StartsWith(prefix); + } + + /// + /// Returns all elements that start with the given char sequence or a similar one. + /// + /// Accessor for the indexed property. The expression as a string is used as an identifier for the index. Hence, the convention is to always use x as an identifier. + /// Is passed to using . + /// The prefix to use + /// The maximum distance (e.g. Levenshtein) between the input prefix and matches + /// The name of the index. Usually, you should not specify this as the expression in is automatically passed by the compiler. + [SuppressMessage("Style", "IDE0060:Remove unused parameter", Justification = "Used as caller argument expression")] + public IEnumerable FuzzyStartsWith(Func> indexAccessor, ReadOnlyMemory prefix, int maxDistance, [CallerArgumentExpression("indexAccessor")] string? 
indexName = null) + { + TypedIndex> typedIndex = GetIndex>(indexName); + return typedIndex.FuzzyStartsWith(prefix, maxDistance); + } + + /// + /// Returns all elements that contain the given char sequence + /// + /// Accessor for the indexed property. The expression as a string is used as an identifier for the index. Hence, the convention is to always use x as an identifier. + /// Is passed to using . + /// The infix to use + /// The name of the index. Usually, you should not specify this as the expression in is automatically passed by the compiler. + [SuppressMessage("Style", "IDE0060:Remove unused parameter", Justification = "Used as caller argument expression")] + public IEnumerable Contains(Func> indexAccessor, ReadOnlyMemory infix, [CallerArgumentExpression("indexAccessor")] string? indexName = null) + { + TypedIndex> typedIndex = GetIndex>(indexName); + return typedIndex.Contains(infix); + } + + /// + /// Returns all elements that contain the given char sequence or a similar one. + /// + /// Accessor for the indexed property. The expression as a string is used as an identifier for the index. Hence, the convention is to always use x as an identifier. + /// Is passed to using . + /// The infix to use + /// The maximum distance (e.g. Levenshtein) between the input infix and matches + /// The name of the index. Usually, you should not specify this as the expression in is automatically passed by the compiler. + [SuppressMessage("Style", "IDE0060:Remove unused parameter", Justification = "Used as caller argument expression")] + public IEnumerable FuzzyContains(Func> indexAccessor, ReadOnlyMemory infix, int maxDistance, [CallerArgumentExpression("indexAccessor")] string? indexName = null) + { + TypedIndex> typedIndex = GetIndex>(indexName); + return typedIndex.FuzzyContains(infix, maxDistance); + } + /// /// Returns all values by fully enumerating the entire set. 
/// diff --git a/Akade.IndexedSet/IndexedSetBuilder.cs b/Akade.IndexedSet/IndexedSetBuilder.cs index c720da5..63f4f5d 100644 --- a/Akade.IndexedSet/IndexedSetBuilder.cs +++ b/Akade.IndexedSet/IndexedSetBuilder.cs @@ -185,6 +185,52 @@ public virtual IndexedSetBuilder WithRangeIndex(Func + /// Configures the to have a full text index based on a secondary key that + /// supports fuzzy search on string startswith/contains queries. The secondary key can be any expression that does not change while + /// the element is within the indexed set. The name of the index is based on the + /// string representation of the expression and passed by the compiler to . + /// The convention is to always use x as a lambda parameter: x => x.StringProp1. Alternatively, you can also always use the same method from a static class. + /// + /// Accessor for the indexed property. The expression as a string is used as an identifier for the index. + /// Hence, the convention is to always use x as an identifier in case a lambda expression is used. + /// The name of the index. Usually, you should not specify this as the expression in is automatically passed by the compiler. + /// The instance on which this method is called is returned to support the fluent syntax. + public virtual IndexedSetBuilder WithFullTextIndex(Func> keyAccessor, [CallerArgumentExpression("keyAccessor")] string? indexName = null) + { + if (indexName is null) + { + throw new ArgumentNullException(nameof(indexName)); + } + + _result.AddIndex(new FullTextIndex(keyAccessor, indexName)); + + return this; + } + + /// + /// Configures the to have a prefix index based on a secondary key that + /// supports fuzzy search on string startswith queries. The secondary key can be any expression that does not change while + /// the element is within the indexed set. The name of the index is based on the + /// string representation of the expression and passed by the compiler to . 
+ /// The convention is to always use x as a lambda parameter: x => x.StringProp1. Alternatively, you can also always use the same method from a static class. + /// + /// Accessor for the indexed property. The expression as a string is used as an identifier for the index. + /// Hence, the convention is to always use x as an identifier in case a lambda expression is used. + /// The name of the index. Usually, you should not specify this as the expression in is automatically passed by the compiler. + /// The instance on which this method is called is returned to support the fluent syntax. + public virtual IndexedSetBuilder WithPrefixIndex(Func> keyAccessor, [CallerArgumentExpression("keyAccessor")] string? indexName = null) + { + if (indexName is null) + { + throw new ArgumentNullException(nameof(indexName)); + } + + _result.AddIndex(new PrefixIndex(keyAccessor, indexName)); + + return this; + } + /// /// Builds and returns the configured /// @@ -240,6 +286,20 @@ public override IndexedSetBuilder WithUniqueIndex + public override IndexedSetBuilder WithFullTextIndex(Func> keyAccessor, [CallerArgumentExpression("keyAccessor")] string? indexName = null) + { + _ = base.WithFullTextIndex(keyAccessor, indexName); + return this; + } + + /// + public override IndexedSetBuilder WithPrefixIndex(Func> keyAccessor, [CallerArgumentExpression("keyAccessor")] string? 
indexName = null) + { + _ = base.WithPrefixIndex(keyAccessor, indexName); + return this; + } + /// public override IndexedSet Build() { diff --git a/Akade.IndexedSet/Indices/FullTextIndex.cs b/Akade.IndexedSet/Indices/FullTextIndex.cs new file mode 100644 index 0000000..1aaa1a2 --- /dev/null +++ b/Akade.IndexedSet/Indices/FullTextIndex.cs @@ -0,0 +1,78 @@ +using Akade.IndexedSet.DataStructures; +using Akade.IndexedSet.Utils; + +namespace Akade.IndexedSet.Indices; +internal class FullTextIndex : TypedIndex> +{ + private readonly SuffixTrie _suffixTrie; + private readonly Func> _keyAccessor; + + public FullTextIndex(Func> keyAccessor, string name) : base(name) + { + _keyAccessor = keyAccessor; + _suffixTrie = new(); + } + + public override void Add(TElement value) + { + ReadOnlyMemory key = _keyAccessor(value); + _ = _suffixTrie.Add(key.Span, value); + } + + public override void Remove(TElement value) + { + ReadOnlyMemory key = _keyAccessor(value); + _ = _suffixTrie.Remove(key.Span, value); + } + + internal override TElement Single(ReadOnlyMemory indexKey) + { + return _suffixTrie.GetAll(indexKey.Span).Single(); + } + + internal override bool TryGetSingle(ReadOnlyMemory indexKey, out TElement? 
element) + { + IEnumerable allMatches = _suffixTrie.Get(indexKey.Span); + element = default; + + IEnumerator enumerator = allMatches.GetEnumerator(); + + if (!enumerator.MoveNext()) + { + return false; + } + + element = enumerator.Current; + + return !enumerator.MoveNext(); + } + + internal override IEnumerable Where(ReadOnlyMemory indexKey) + { + return _suffixTrie.Get(indexKey.Span); + } + + internal override IEnumerable StartsWith(ReadOnlyMemory indexKey) + { + return _suffixTrie.GetAll(indexKey.Span) + .Where(candidate => _keyAccessor(candidate).Span.StartsWith(indexKey.Span)) + .Distinct(); + } + + internal override IEnumerable FuzzyStartsWith(ReadOnlyMemory indexKey, int maxDistance) + { + return _suffixTrie.FuzzySearch(indexKey.Span, maxDistance, false) + .Where(candidate => LevenshteinDistance.FuzzyMatch(_keyAccessor(candidate).Span[..Math.Min(_keyAccessor(candidate).Length, indexKey.Length)], indexKey.Span, maxDistance)) + .Distinct(); + } + + internal override IEnumerable FuzzyContains(ReadOnlyMemory indexKey, int maxDistance) + { + return _suffixTrie.FuzzySearch(indexKey.Span, maxDistance, false).Distinct(); + } + + internal override IEnumerable Contains(ReadOnlyMemory indexKey) + { + return _suffixTrie.GetAll(indexKey.Span).Distinct(); + } +} diff --git a/Akade.IndexedSet/Indices/PrefixIndex.cs b/Akade.IndexedSet/Indices/PrefixIndex.cs new file mode 100644 index 0000000..dd667a5 --- /dev/null +++ b/Akade.IndexedSet/Indices/PrefixIndex.cs @@ -0,0 +1,63 @@ +using Akade.IndexedSet.DataStructures; + +namespace Akade.IndexedSet.Indices; +internal class PrefixIndex : TypedIndex> +{ + private readonly Trie _trie; + private readonly Func> _keyAccessor; + + public PrefixIndex(Func> keyAccessor, string name) : base(name) + { + _keyAccessor = keyAccessor; + _trie = new(); + } + + public override void Add(TElement value) + { + ReadOnlyMemory key = _keyAccessor(value); + _ = _trie.Add(key.Span, value); + } + + public override void Remove(TElement value) + { + ReadOnlyMemory key = _keyAccessor(value); + _ = 
_trie.Remove(key.Span, value); + } + + internal override TElement Single(ReadOnlyMemory indexKey) + { + return _trie.GetAll(indexKey.Span).Single(); + } + + internal override bool TryGetSingle(ReadOnlyMemory indexKey, out TElement? element) + { + IEnumerable allMatches = _trie.Get(indexKey.Span); + element = default; + + IEnumerator enumerator = allMatches.GetEnumerator(); + + if (!enumerator.MoveNext()) + { + return false; + } + + element = enumerator.Current; + + return !enumerator.MoveNext(); + } + + internal override IEnumerable Where(ReadOnlyMemory indexKey) + { + return _trie.Get(indexKey.Span); + } + + internal override IEnumerable StartsWith(ReadOnlyMemory indexKey) + { + return _trie.GetAll(indexKey.Span); + } + + internal override IEnumerable FuzzyStartsWith(ReadOnlyMemory indexKey, int maxDistance) + { + return _trie.FuzzySearch(indexKey.Span, maxDistance, false); + } +} diff --git a/Akade.IndexedSet/Indices/TypedIndex.cs b/Akade.IndexedSet/Indices/TypedIndex.cs index 292637d..28ce09f 100644 --- a/Akade.IndexedSet/Indices/TypedIndex.cs +++ b/Akade.IndexedSet/Indices/TypedIndex.cs @@ -15,6 +15,8 @@ protected TypedIndex(string name) : base(name) internal abstract IEnumerable Where(TIndexKey indexKey); + internal abstract bool TryGetSingle(TIndexKey indexKey, out TElement? element); + internal virtual IEnumerable Range(TIndexKey start, TIndexKey end, bool inclusiveStart, bool inclusiveEnd) { throw new NotSupportedException($"Range queries are not supported on {GetType().Name}-indices. Use a range index to support this scenario."); @@ -70,5 +72,23 @@ internal virtual IEnumerable OrderByDescending(int skip) throw new NotSupportedException($"OrderByDescending queries are not supported on {GetType().Name}-indices. Use a range index to support this scenario."); } - internal abstract bool TryGetSingle(TIndexKey indexKey, out TElement? 
element); + internal virtual IEnumerable FuzzyStartsWith(ReadOnlyMemory indexKey, int maxDistance) + { + throw new NotSupportedException($"Fuzzy starts with queries are not supported on {GetType().Name}-indices. Use a full text or prefix index to support this scenario."); + } + + internal virtual IEnumerable FuzzyContains(ReadOnlyMemory indexKey, int maxDistance) + { + throw new NotSupportedException($"Fuzzy contains queries are not supported on {GetType().Name}-indices. Use a full text index to support this scenario."); + } + + internal virtual IEnumerable StartsWith(ReadOnlyMemory indexKey) + { + throw new NotSupportedException($"Starts with queries are not supported on {GetType().Name}-indices. Use a full text or prefix index to support this scenario."); + } + + internal virtual IEnumerable Contains(ReadOnlyMemory indexKey) + { + throw new NotSupportedException($"Contains queries are not supported on {GetType().Name}-indices. Use a full text index to support this scenario."); + } } diff --git a/Akade.IndexedSet/Utils/LevenshteinDistance.cs b/Akade.IndexedSet/Utils/LevenshteinDistance.cs new file mode 100644 index 0000000..08554f5 --- /dev/null +++ b/Akade.IndexedSet/Utils/LevenshteinDistance.cs @@ -0,0 +1,53 @@ +namespace Akade.IndexedSet.Utils; + +internal static class LevenshteinDistance +{ + /// + /// Returns true if the strings have a Levenshtein distance smaller than <paramref name="maxDistance"/>. + /// Does not calculate the entire distance if the minimum distance is already bigger. 
+ /// + public static bool FuzzyMatch(ReadOnlySpan a, ReadOnlySpan b, int maxDistance) + { + int wordLength = a.Length; + + if (a.Length == 0 && b.Length == 0) + { + return true; + } + + if (Math.Abs(a.Length - b.Length) > maxDistance) + { + return false; + } + + int[] currentRow = new int[wordLength + 1]; + for (int i = 0; i < currentRow.Length; i++) + { + currentRow[i] = i; + } + + for (int j = 0; j < b.Length; j++) + { + int[] lastRow = currentRow; + currentRow = new int[wordLength + 1]; + currentRow[0] = lastRow[0] + 1; + + int minDistance = currentRow[0]; + + for (int i = 1; i < currentRow.Length; i++) + { + int insertOrDeletion = Math.Min(currentRow[i - 1] + 1, lastRow[i] + 1); + int replacement = a[i - 1] == b[j] ? lastRow[i - 1] : lastRow[i - 1] + 1; + currentRow[i] = Math.Min(insertOrDeletion, replacement); + minDistance = Math.Min(minDistance, currentRow[i]); + } + + if (minDistance > maxDistance) + { + return false; + } + } + + return true; + } +} diff --git a/README.md b/README.md index f7726e2..0795008 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,25 @@ Provides an In-Memory data structure, the IndexedSet, that allows to easily add indices to allow efficient querying. Based on often seeing inefficient usage of `.FirstOrDefault`, `.Where`, `.Single` etc... and implementing data-structures to improve those queries for every project I'm on. 
+ + - [Overview](#overview) + - [Performance / Operation-Support of the different indices:](#performance-operation-support-of-the-different-indices) + - [General queries](#general-queries) + - [String queries](#string-queries) + - [Features](#features) + - [Unique index (single entity, single key)](#unique-index-single-entity-single-key) + - [Non-unique index (multiple entities, single key)](#non-unique-index-multiple-entities-single-key) + - [Non-unique index (multiple entities, multiple keys)](#non-unique-index-multiple-entities-multiple-keys) + - [Range index](#range-index) + - [String indices & fuzzy matching](#string-indices-fuzzy-matching) + - [Computed or compound key](#computed-or-compound-key) + - [Reflection- & expression-free - convention-based index naming](#reflection-expression-free-convention-based-index-naming) + - [Updating key-values](#updating-key-values) + - [FAQs](#faqs) + - [How do I use multiple index types for the same property?](#how-do-i-use-multiple-index-types-for-the-same-property) + - [Roadmap](#roadmap) + + ## Overview A sample showing different queries as you might want do for a report: @@ -35,24 +54,41 @@ _ = set.MaxBy(x => x.Amount * x.UnitPrice); _ = set.Where(x => (x.ProductId, x.UnitPrice), (4, 10)); ``` -Performance / Operation-Support of the different indices: +### Performance / Operation-Support of the different indices: - n: total number of elements - m: number of elements in the return set +- ✔: Supported +- ⚠: Supported but throws if not exactly 1 item was found +- ❌: Not-supported + +#### General queries + +| Query | Unique-Index | NonUnique-Index | Range-Index | +| --------- | ------------ | --------------- | --------------- | +| Single | ⚠ O(1) | ⚠ O(1) | ⚠ O(log n) | +| Where | ✔ O(1) | ✔ O(m) | ✔ O(log n + m) | +| Range | ❌ | ❌ | ✔ O(log n + m) | +| < / <= | ❌ | ❌ | ✔ O(log n + m) | +| > / >= | ❌ | ❌ | ✔ O(log n + m) | +| OrderBy | ❌ | ❌ | ✔ O(m) | +| Max/Min | ❌ | ❌ | ✔ O(1) | + +#### String queries + +- w: length of 
query word +- D: maximum distance in fuzzy query -| Query | Unique-Index | NonUnique-Index | Range-Index | -| ------ | ------------ | --------------- | --------------- | -| Single | ⚠ O(1) | ⚠ O(1) | ⚠ O(log n) | -| Where | ✔ O(1) | ✔ O(m) | ✔ O(log n + m) | -| Range | ❌ | ❌ | ✔ O(log n + m) | -| < / <= | ❌ | ❌ | ✔ O(log n + m) | -| > / >= | ❌ | ❌ | ✔ O(log n + m) | -| OrderBy | ❌ | ❌ | ✔ O(m) | -| Max/Min | ❌ | ❌ | ✔ O(1) | +| Query | Prefix-Index | FullText-Index | +| ----------------| ------------ | ---------------| +| StartsWith | ⚠ O(w) | ⚠ O(w) | +| Contains | ❌ | ✔ O(w) | +| Fuzzy StartsWith | ⚠ O(w+D) | ⚠ O(w+D) | +| Fuzzy Contains | ❌ | ✔ O(w+D) | + +> ℹ FullText indices use a lot more memory than prefix indices and are more expensive to construct. Only +use FullText indices if you really require them. -✔: Supported -⚠: Supported but throws if not exactly 1 item was found -❌: Not-supported ## Features This project aims to provide a data structure (*it's not a DB!*) that allows to easily setup fast access on different properties: @@ -73,7 +109,9 @@ Data data = set[1]; data = set.Single(x => x.SecondaryKey, 5); ``` -> ℹ Entities do not require a primary key. `IndexedSet` inherits from `IndexedSet` but provides convenient access to the automatically added unique index: `set[primaryKey]` instead of `set.Single(x => x.PrimaryKey, primaryKey)`. +> ℹ Entities do not require a primary key. `IndexedSet` inherits from `IndexedSet` +but provides convenient access to the automatically added unique index: `set[primaryKey]` instead +of `set.Single(x => x.PrimaryKey, primaryKey)`. ### Non-unique index (multiple entities, single key) @@ -138,6 +176,27 @@ data = set.LessThan(x => x.SecondaryKey, 4); data = set.OrderBy(x => x.SecondaryKey, skip: 10).Take(10); // second page of 10 elements ``` +### String indices & fuzzy matching +Prefix- & Suffix-Trie based indices for efficient StartsWith & String-Contains queries including support +for fuzzy matching. 
+ +```csharp +IndexedSet data = typeof(object).Assembly.GetTypes() + .ToIndexedSet() + .WithPrefixIndex(x => x.Name.AsMemory()) + .WithFullTextIndex(x => x.FullName.AsMemory()) + .Build(); + +// fast prefix or contains queries via indices +_ = data.StartsWith(x => x.Name.AsMemory(), "Int".AsMemory()); +_ = data.Contains(x => x.FullName.AsMemory(), "Int".AsMemory()); + +// fuzzy searching is supported by prefix and full text indices +// the following will also match "String" +_ = data.FuzzyStartsWith(x => x.Name.AsMemory(), "Strang".AsMemory(), 1); +_ = data.FuzzyContains(x => x.FullName.AsMemory(), "Strang".AsMemory(), 1); +``` + ### Computed or compound key The data structure also allows to use computed or compound keys: @@ -200,7 +259,7 @@ Potential features (not ordered): - [ ] Thread-safe version - [ ] Easier updating of keys - [ ] Events for changed values -- [ ] More index types (Trie) +- [x] More index types (Trie) - [ ] Tree-based range index for better insertion performance - [ ] Analyzers to help with best practices - [x] Range insertion and corresponding `.ToIndexedSet().WithIndex(x => ...).[...].Build()`