Skip to content

Commit

Permalink
Add Bitap pattern matching (#458)
Browse files Browse the repository at this point in the history
  • Loading branch information
Kalkwst authored Jul 25, 2024
1 parent 327af3d commit b0838cb
Show file tree
Hide file tree
Showing 3 changed files with 363 additions and 0 deletions.
119 changes: 119 additions & 0 deletions Algorithms.Tests/Strings/PatternMatching/BitapTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
using System;
using Algorithms.Strings.PatternMatching;
using NUnit.Framework;

namespace Algorithms.Tests.Strings.PatternMatching;

/// <summary>
/// Unit tests for <see cref="Bitap"/>, covering both the exact search
/// (<see cref="Bitap.FindExactPattern"/>) and the mismatch-tolerant search
/// (<see cref="Bitap.FindFuzzyPattern"/>).
/// </summary>
[TestFixture]
public class BitapTests
{
    /// <summary>
    /// An exact search for a non-empty pattern in an empty text reports "not found".
    /// </summary>
    [Test]
    public void FindExactPattern_EmptyTextReturnsError()
    {
        var position = Bitap.FindExactPattern(string.Empty, "abc");

        Assert.That(position, Is.EqualTo(-1));
    }

    /// <summary>
    /// An exact search for an empty pattern reports index 0.
    /// </summary>
    [Test]
    public void FindExactPattern_EmptyPatternReturnsZero()
    {
        var position = Bitap.FindExactPattern("abc", string.Empty);

        Assert.That(position, Is.EqualTo(0));
    }

    /// <summary>
    /// A pattern that is a prefix of the text is found at index 0.
    /// </summary>
    [Test]
    public void FindExactPattern_PatternFoundAtBeginning()
    {
        var position = Bitap.FindExactPattern("hello world", "hello");

        Assert.That(position, Is.EqualTo(0));
    }

    /// <summary>
    /// A pattern located inside the text is reported at its starting index.
    /// </summary>
    [Test]
    public void FindExactPattern_PatternFoundInTheMiddle()
    {
        var position = Bitap.FindExactPattern("abcabc", "cab");

        Assert.That(position, Is.EqualTo(2));
    }

    /// <summary>
    /// A pattern that is a suffix of the text is reported at its starting index.
    /// </summary>
    [Test]
    public void FindExactPattern_PatternFoundAtEnd()
    {
        var position = Bitap.FindExactPattern("the end", "end");

        Assert.That(position, Is.EqualTo(4));
    }

    /// <summary>
    /// A pattern that does not occur in the text yields -1.
    /// </summary>
    [Test]
    public void FindExactPattern_PatternNotFound()
    {
        var position = Bitap.FindExactPattern("abcdefg", "xyz");

        Assert.That(position, Is.EqualTo(-1));
    }

    /// <summary>
    /// A pattern longer than the text yields -1.
    /// </summary>
    [Test]
    public void FindExactPattern_PatternLongerThanText()
    {
        var position = Bitap.FindExactPattern("short", "longerpattern");

        Assert.That(position, Is.EqualTo(-1));
    }

    /// <summary>
    /// When occurrences overlap, the first one is reported.
    /// </summary>
    [Test]
    public void FindExactPattern_OverlappingPatterns()
    {
        var position = Bitap.FindExactPattern("ababab", "abab");

        Assert.That(position, Is.EqualTo(0));
    }

    /// <summary>
    /// A 32-character pattern is rejected by the exact search with an <see cref="ArgumentException"/>.
    /// </summary>
    [Test]
    public void FindExactPattern_PatternTooLongThrowsException()
    {
        var oversizedPattern = new string('a', 32);

        Assert.Throws<ArgumentException>(() => Bitap.FindExactPattern("some text", oversizedPattern));
    }

    /// <summary>
    /// Punctuation and whitespace are matched like any other character.
    /// </summary>
    [Test]
    public void FindExactPattern_SpecialCharactersInPattern()
    {
        var position = Bitap.FindExactPattern("hello, world!", ", wo");

        Assert.That(position, Is.EqualTo(5));
    }

    /// <summary>
    /// The fuzzy search reports 0 for an empty text.
    /// </summary>
    [Test]
    public void FindFuzzyPattern_EmptyTextReturnsZero()
    {
        var position = Bitap.FindFuzzyPattern(string.Empty, "abc", 1);

        Assert.That(position, Is.EqualTo(0));
    }

    /// <summary>
    /// The fuzzy search reports 0 for an empty pattern.
    /// </summary>
    [Test]
    public void FindFuzzyPattern_EmptyPatternReturnsZero()
    {
        var position = Bitap.FindFuzzyPattern("def", string.Empty, 1);

        Assert.That(position, Is.EqualTo(0));
    }

    /// <summary>
    /// With a threshold of 0 the fuzzy search finds an exact occurrence.
    /// </summary>
    [Test]
    public void FindFuzzyPattern_ExactMatchFound()
    {
        var position = Bitap.FindFuzzyPattern("hello world", "hello", 0);

        Assert.That(position, Is.EqualTo(0));
    }

    /// <summary>
    /// A single differing character is tolerated when the threshold is 1.
    /// </summary>
    [Test]
    public void FindFuzzyPattern_FuzzyMatchWithOneMismatch()
    {
        var position = Bitap.FindFuzzyPattern("hello world", "hellp", 1);

        Assert.That(position, Is.EqualTo(0));
    }

    /// <summary>
    /// Two differing characters are tolerated when the threshold is 2.
    /// </summary>
    [Test]
    public void FindFuzzyPattern_FuzzyMatchWithMultipleMismatches()
    {
        var position = Bitap.FindFuzzyPattern("abcde", "xbcdz", 2);

        Assert.That(position, Is.EqualTo(0));
    }

    /// <summary>
    /// A fuzzy occurrence that ends at the last character of the text is reported at its starting index.
    /// </summary>
    [Test]
    public void FindFuzzyPattern_FuzzyMatchAtEnd()
    {
        var position = Bitap.FindFuzzyPattern("abcdefg", "efx", 1);

        Assert.That(position, Is.EqualTo(4));
    }

    /// <summary>
    /// When every window needs more mismatches than the threshold allows, the result is -1.
    /// </summary>
    [Test]
    public void FindFuzzyPattern_FuzzyMatchNotFound()
    {
        var position = Bitap.FindFuzzyPattern("abcdefg", "xyz", 2);

        Assert.That(position, Is.EqualTo(-1));
    }

    /// <summary>
    /// A 32-character pattern makes the fuzzy search return -1 (it does not throw).
    /// </summary>
    [Test]
    public void FindFuzzyPattern_PatternTooLongReturnsNegativeOne()
    {
        var oversizedPattern = new string('a', 32);

        var position = Bitap.FindFuzzyPattern("some text", oversizedPattern, 1);

        Assert.That(position, Is.EqualTo(-1));
    }
}
243 changes: 243 additions & 0 deletions Algorithms/Strings/PatternMatching/Bitap.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,243 @@
using System;
using System.Collections.Generic;

namespace Algorithms.Strings.PatternMatching;

/// <summary>
/// The Bitap (shift-or) algorithm is a string matching technique. It can find exact matches of a pattern within a
/// text as well as approximate matches that allow a certain number of mismatched characters (e.g., mistypes, minor
/// variations etc.). It's known for its efficiency, using bitwise operations for fast comparisons.
///
/// <para>
/// <b>How it works:</b>
/// <list type="number">
/// <item>
/// <term>Initialization</term>
/// <description>
/// A bitmask is created for each distinct character in the pattern. Bit <c>i</c> of a character's mask is 0 when that
/// character occurs at position <c>i</c> of the pattern and 1 otherwise; characters that do not occur in the pattern
/// use a mask of all 1s. The state register <c>R</c> starts as all 1s except for bit 0, which is cleared to represent
/// the (always matching) empty prefix of the pattern.
/// </description>
/// </item>
/// <item>
/// <term>Iteration</term>
/// <description>
/// The algorithm iterates through each character in the text. For each character, the state <c>R</c> is updated using
/// bitwise operations (a logical OR with the character's mask followed by a left shift). After the update, a 0 in bit
/// <c>j</c> of <c>R</c> means that the first <c>j</c> characters of the pattern match the text ending at the current
/// position.
/// </description>
/// </item>
/// <item>
/// <term>Matching</term>
/// <description>
/// After each iteration, the algorithm checks whether the bit at position <c>pattern.Length</c> of <c>R</c> is 0.
/// If it is, the whole pattern matches the text ending at the current position.
/// </description>
/// </item>
/// </list>
/// </para>
/// <para>
/// <b> Finding Matches </b>
/// </para>
/// <para>
/// For fuzzy matching one register is kept per allowed number of mismatches: register <c>d</c> tracks the prefixes
/// that match with at most <c>d</c> substituted characters. A match within the allowed threshold is found as soon as
/// the bit at position <c>pattern.Length</c> of the register for the threshold becomes 0.
/// </para>
/// </summary>
public static class Bitap
{
    /// <summary>
    /// The longest supported pattern. Bit 0 of a register is reserved for the empty prefix and the bit at position
    /// <c>pattern.Length</c> signals a match, so a 32-bit <see cref="int"/> can track at most 31 pattern characters.
    /// </summary>
    private const int MaxPatternLength = 31;

    /// <summary>
    /// <para>
    /// This function implements the Bitap algorithm for finding exact matches of a pattern within a text.
    /// It aims to find the first occurrence of the pattern in the text, allowing for no mismatches.
    /// </para>
    /// <para>
    /// The algorithm iterates through each character in the text. For each character, the state <c>R</c> is ORed with
    /// the mask of that character and shifted left by one. The pattern ends at the current position exactly when the
    /// bit at position <c>pattern.Length</c> of <c>R</c> is 0.
    /// </para>
    /// <para>
    /// Characters are compared by their UTF-16 code unit, so text and pattern may contain any character, not only
    /// ASCII. A <c>null</c> pattern is treated like an empty pattern and a <c>null</c> text like an empty text.
    /// </para>
    /// <para>
    /// The function throws an <see cref="ArgumentException"/> if the pattern is longer than 31 characters,
    /// because a longer pattern cannot be tracked in a 32-bit state register.
    /// </para>
    /// </summary>
    /// <param name="text">The text to search in.</param>
    /// <param name="pattern">The pattern to search for.</param>
    /// <returns>
    /// The index of the first occurrence of the pattern in the text, 0 if the pattern is empty,
    /// or -1 if the pattern is not found.
    /// </returns>
    /// <exception cref="ArgumentException">The pattern is longer than 31 characters.</exception>
    public static int FindExactPattern(string text, string pattern)
    {
        // An empty (or null) pattern trivially matches at the start of any text.
        if (string.IsNullOrEmpty(pattern))
        {
            return 0;
        }

        if (pattern.Length > MaxPatternLength)
        {
            throw new ArgumentException("The pattern is longer than 31 characters.", nameof(pattern));
        }

        // A non-empty pattern can never occur in an empty (or null) text.
        if (string.IsNullOrEmpty(text))
        {
            return -1;
        }

        var len = pattern.Length;
        var patternMask = BuildPatternMask(pattern);

        // The bit that becomes 0 once all `len` pattern characters have been matched.
        var matchBit = 1 << len;

        // All bits set except bit 0: only the empty prefix of the pattern is matched so far.
        var r = ~1;

        for (var index = 0; index < text.Length; ++index)
        {
            // Keep only those partial matches that the current character extends, then advance them by one.
            r |= GetMask(patternMask, text[index]);
            r <<= 1;

            if ((r & matchBit) == 0)
            {
                // The match ends at `index`, so it starts `len - 1` characters earlier.
                return index - len + 1;
            }
        }

        return -1;
    }

    /// <summary>
    /// Finds the first occurrence of a pattern in a given text with a given threshold for mismatches.
    /// A mismatch is a substituted character; inserted or deleted characters are not tolerated.
    /// </summary>
    /// <remarks>
    /// Unlike <see cref="FindExactPattern"/>, this method returns 0 when the text is empty and returns -1 instead of
    /// throwing when the pattern is longer than 31 characters. Characters are compared by their UTF-16 code unit,
    /// so text and pattern may contain any character, not only ASCII.
    /// </remarks>
    /// <param name="text">The text to search in.</param>
    /// <param name="pattern">The pattern to search for.</param>
    /// <param name="threshold">The maximum number of mismatches allowed. Must not be negative.</param>
    /// <returns>
    /// The index of the first occurrence of the pattern in the text, 0 if the text or the pattern is empty,
    /// or -1 if the pattern is not found or is longer than 31 characters.
    /// </returns>
    /// <exception cref="ArgumentOutOfRangeException">The threshold is negative.</exception>
    public static int FindFuzzyPattern(string text, string pattern, int threshold)
    {
        if (threshold < 0)
        {
            throw new ArgumentOutOfRangeException(nameof(threshold), threshold, "The threshold must not be negative.");
        }

        // Empty (or null) inputs are reported as a match at index 0.
        if (string.IsNullOrEmpty(text) || string.IsNullOrEmpty(pattern))
        {
            return 0;
        }

        var len = pattern.Length;

        // A pattern longer than 31 characters cannot be tracked in a 32-bit register.
        if (len > MaxPatternLength)
        {
            return -1;
        }

        // Allowing more mismatches than the pattern has characters cannot change the result,
        // so the number of registers is capped at len + 1 regardless of how large the threshold is.
        var maxErrors = Math.Min(threshold, len);

        var patternMask = BuildPatternMask(pattern);
        var matchBit = 1 << len;

        // r[d] tracks the pattern prefixes that match with at most d mismatches.
        // Every register starts with all bits set except bit 0 (the empty prefix).
        var r = new int[maxErrors + 1];
        for (var d = 0; d <= maxErrors; ++d)
        {
            r[d] = ~1;
        }

        for (var i = 0; i < text.Length; ++i)
        {
            var mask = GetMask(patternMask, text[i]);

            // Value of the register one level below, as it was before this character was processed.
            var previous = r[0];

            // Level 0 is the plain exact-match update.
            r[0] = (r[0] | mask) << 1;

            for (var d = 1; d <= maxErrors; ++d)
            {
                var current = r[d];

                // A prefix survives at level d if the character matches at level d,
                // or if it is substituted on top of a prefix that was alive at level d - 1.
                r[d] = (previous & (current | mask)) << 1;
                previous = current;
            }

            if ((r[maxErrors] & matchBit) == 0)
            {
                // The match ends at `i`, so it starts `len - 1` characters earlier.
                return i - len + 1;
            }
        }

        return -1;
    }

    /// <summary>
    /// Builds the per-character bitmasks of a pattern: bit <c>i</c> of a character's mask is 0 when the character
    /// occurs at position <c>i</c> of the pattern and 1 otherwise.
    /// </summary>
    /// <param name="pattern">The pattern, at most 31 characters long.</param>
    /// <returns>The masks of all characters that occur in the pattern.</returns>
    private static Dictionary<char, int> BuildPatternMask(string pattern)
    {
        var masks = new Dictionary<char, int>();

        for (var i = 0; i < pattern.Length; ++i)
        {
            var symbol = pattern[i];
            var mask = masks.TryGetValue(symbol, out var existing) ? existing : ~0;
            masks[symbol] = mask & ~(1 << i);
        }

        return masks;
    }

    /// <summary>
    /// Returns the mask of a character, or all 1s if the character does not occur in the pattern.
    /// </summary>
    /// <param name="masks">The masks produced by <see cref="BuildPatternMask"/>.</param>
    /// <param name="symbol">The text character to look up.</param>
    /// <returns>The bitmask to OR into the state register for this character.</returns>
    private static int GetMask(Dictionary<char, int> masks, char symbol) =>
        masks.TryGetValue(symbol, out var mask) ? mask : ~0;
}
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ find more than one implementation for the same objective but using different alg
* [Jaro Similarity](./Algorithms/Strings/Similarity/JaroSimilarity.cs)
* [Jaro-Winkler Distance](./Algorithms/Strings/Similarity/JaroWinklerDistance.cs)
* [Pattern Matching](./Algorithms/Strings/PatternMatching/)
* [Bitap Pattern Matching](./Algorithms/Strings/PatternMatching/Bitap.cs)
* [Naive String Search](./Algorithms/Strings/PatternMatching/NaiveStringSearch.cs)
* [Rabin Karp](./Algorithms/Strings/PatternMatching/RabinKarp.cs)
* [Boyer Moore](./Algorithms/Strings/PatternMatching/BoyerMoore.cs)
Expand Down

0 comments on commit b0838cb

Please sign in to comment.