Skip to content

Commit

Permalink
todo: feat(lists): add support for wildcard lists using a custom Trie
Browse files Browse the repository at this point in the history
A couple other Trie implementations were tested but they use more
memory and are slower. See PR #<FIXME> for details.
  • Loading branch information
ThinkChaos committed Nov 11, 2023
1 parent 7547e02 commit a4e993e
Show file tree
Hide file tree
Showing 13 changed files with 891,967 additions and 22 deletions.
7 changes: 7 additions & 0 deletions cache/stringcache/in_memory_grouped_cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ func NewInMemoryGroupedRegexCache() *InMemoryGroupedCache {
}
}

// NewInMemoryGroupedWildcardCache returns an InMemoryGroupedCache with an
// empty group map and newWildcardCacheFactory configured as its factory
// function.
func NewInMemoryGroupedWildcardCache() *InMemoryGroupedCache {
	grouped := new(InMemoryGroupedCache)
	grouped.caches = make(map[string]stringCache)
	grouped.factoryFn = newWildcardCacheFactory

	return grouped
}

func (c *InMemoryGroupedCache) ElementCount(group string) int {
c.lock.RLock()
cache, found := c.caches[group]
Expand Down
75 changes: 70 additions & 5 deletions cache/stringcache/string_caches.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"strings"

"github.com/0xERR0R/blocky/log"
"github.com/0xERR0R/blocky/trie"
)

type stringCache interface {
Expand Down Expand Up @@ -86,7 +87,7 @@ func (s *stringCacheFactory) insertString(entry string) {
ix := sort.SearchStrings(bucket, normalized)

if !(ix < len(bucket) && bucket[ix] == normalized) {
// extent internal bucket
// extend internal bucket
bucket = append(s.getBucket(entryLen), "")

// move elements to make place for the insertion
Expand All @@ -99,8 +100,7 @@ func (s *stringCacheFactory) insertString(entry string) {
}

func (s *stringCacheFactory) addEntry(entry string) {
// skip empty strings and regex
if len(entry) > 0 && !isRegex(entry) {
if len(entry) > 0 && !isRegex(entry) && !isWildcard(entry) {
s.cnt++
s.insertString(entry)
}
Expand All @@ -112,8 +112,6 @@ func (s *stringCacheFactory) create() stringCache {
cache[k] = strings.Join(v, "")
}

s.tmp = nil

return cache
}

Expand Down Expand Up @@ -196,3 +194,70 @@ func (r *regexCacheFactory) create() stringCache {
// newRegexCacheFactory returns a pointer to a zero-valued regexCacheFactory
// as a cacheFactory.
func newRegexCacheFactory() cacheFactory {
	var factory regexCacheFactory

	return &factory
}

// isWildcard reports whether s contains at least one '*' character.
func isWildcard(s string) bool {
	return strings.IndexByte(s, '*') >= 0
}

// wildcardCache is the stringCache built by wildcardCacheFactory.create:
// a trie of wildcard domains plus the number of entries that were added.
type wildcardCache struct {
trie trie.Trie // queried by contains via HasParentOf
cnt int // value reported by elementCount
}

// elementCount returns the entry count stored in the cache.
func (cache wildcardCache) elementCount() int {
return cache.cnt
}

// contains reports whether the trie holds a parent of domain, as decided by
// trie.HasParentOf.
// NOTE(review): judging by the package tests, an entry such as "*.example.com"
// matches both "example.com" and its subdomains — confirm against the trie package.
func (cache wildcardCache) contains(domain string) bool {
return cache.trie.HasParentOf(domain)
}

// wildcardCacheFactory accumulates wildcard entries into a trie and builds a
// wildcardCache from them.
type wildcardCacheFactory struct {
trie *trie.Trie // receives every entry accepted by addEntry
cnt int // number of entries inserted so far
}

// newWildcardCacheFactory returns a wildcardCacheFactory whose trie is
// created with the trie.SplitTLD split function and whose count starts at zero.
func newWildcardCacheFactory() cacheFactory {
	factory := new(wildcardCacheFactory)
	factory.trie = trie.NewTrie(trie.SplitTLD)

	return factory
}

// addEntry inserts a wildcard entry into the factory's trie.
//
// Entries without any '*' are silently ignored. Entries that do not start
// with "*." or that contain more than one '*' are rejected with a warning.
// Accepted entries are normalized with normalizeWildcard before insertion
// and increase the element count by one.
func (r *wildcardCacheFactory) addEntry(entry string) {
	switch stars := strings.Count(entry, "*"); {
	case stars == 0:
		// not a wildcard: nothing to do for this factory
		return
	case stars > 1 || !strings.HasPrefix(entry, "*."):
		log.Log().Warnf("unsupported wildcard '%s': must start with '*.' and contain no other '*'", entry)

		return
	}

	r.trie.Insert(normalizeWildcard(entry))
	r.cnt++
}

// count returns the number of entries inserted by addEntry so far.
func (r *wildcardCacheFactory) count() int {
return r.cnt
}

// create builds the wildcardCache from the entries added so far.
//
// It returns nil when no entry was inserted; otherwise the cache receives a
// copy of the factory's trie value together with the current count.
func (r *wildcardCacheFactory) create() stringCache {
	if r.cnt != 0 {
		return wildcardCache{
			trie: *r.trie,
			cnt:  r.cnt,
		}
	}

	return nil
}

// normalizeWildcard turns a wildcard entry into the form stored in the trie:
// the entry is passed through normalizeEntry, leading '*' characters are
// removed, '.' is trimmed from both ends and the result is lower-cased.
func normalizeWildcard(domain string) string {
	withoutGlob := strings.TrimLeft(normalizeEntry(domain), "*")
	withoutDots := strings.Trim(withoutGlob, ".")

	return strings.ToLower(withoutDots)
}
184 changes: 167 additions & 17 deletions cache/stringcache/string_caches_benchmark_test.go
Original file line number Diff line number Diff line change
@@ -1,44 +1,194 @@
package stringcache

import (
"math/rand"
"context"
"math"
"os"
"runtime"
"runtime/debug"
"testing"

"github.com/0xERR0R/blocky/lists/parsers"
)

func BenchmarkStringCache(b *testing.B) {
testdata := createTestdata(10_000)
var (
stringTestData []string
wildcardTestData []string

baseMemStats runtime.MemStats
)

func init() { //nolint:gochecknoinits
	// String and Wildcard benchmarks don't use the exact same data,
	// but since it's two versions of the same list it's closer to
	// the real world.
	// For the same data, a Trie uses slightly more memory but searches are much faster.
	//
	// Set to false to benchmark both caches on the same data: the wildcard
	// entries are then derived from the plain list by prefixing "*.".
	const useWildcardList = true

	// If you update either list, make sure both are the same list version (see file header).
	stringTestData = loadTestdata("../../helpertest/data/oisd-big-plain.txt")

	if !useWildcardList {
		derived := make([]string, len(stringTestData))

		for i, domain := range stringTestData {
			derived[i] = "*." + domain
		}

		wildcardTestData = derived

		return
	}

	wildcardTestData = loadTestdata("../../helpertest/data/oisd-big-wildcard.txt")
}

// --- Cache Building ---
//
// Most memory efficient: Wildcard (blocky/trie)
// Fastest: Wildcard (blocky/trie)
//
//nolint:lll
// BenchmarkStringFactory-8 6 174 514 565 ns/op 11.81 fact_heap_MB 26.93 peak_heap_MB 67 621 648 B/op 1 304 allocs/op
// BenchmarkWildcardFactory-8 18 59 718 953 ns/op 16.52 fact_heap_MB 16.52 peak_heap_MB 26 624 735 B/op 92 071 allocs/op

// BenchmarkStringFactory benchmarks cache building using newStringCacheFactory.
func BenchmarkStringFactory(b *testing.B) {
benchmarkStringFactory(b, newStringCacheFactory)
}

// BenchmarkWildcardFactory benchmarks cache building using newWildcardCacheFactory.
func BenchmarkWildcardFactory(b *testing.B) {
benchmarkWildcardFactory(b, newWildcardCacheFactory)
}

// benchmarkStringFactory runs benchmarkFactory for newFactory on stringTestData.
func benchmarkStringFactory(b *testing.B, newFactory func() cacheFactory) {
benchmarkFactory(b, stringTestData, newFactory)
}

// benchmarkWildcardFactory runs benchmarkFactory for newFactory on wildcardTestData.
func benchmarkWildcardFactory(b *testing.B, newFactory func() cacheFactory) {
benchmarkFactory(b, wildcardTestData, newFactory)
}

func benchmarkFactory(b *testing.B, data []string, newFactory func() cacheFactory) {
baseMemStats = readMemStats()

b.ReportAllocs()

var (
factory cacheFactory
cache stringCache
)

for i := 0; i < b.N; i++ {
factory := newStringCacheFactory()
factory = newFactory()

for _, s := range testdata {
for _, s := range data {
factory.addEntry(s)
}

factory.create()
cache = factory.create()
}

b.StopTimer()
reportMemUsage(b, "peak", factory, cache)
reportMemUsage(b, "fact", factory) // cache will be GC'd
}

// --- Cache Querying ---
//
// Most memory efficient: Wildcard (blocky/trie)
// Fastest: Wildcard (blocky/trie)
//
//nolint:lll
// BenchmarkStringCache-8 6 203 026 476 ns/op 15.14 heap_MB 0 B/op 0 allocs/op
// BenchmarkWildcardCache-8 38 30 201 633 ns/op 16.54 heap_MB 0 B/op 0 allocs/op

// BenchmarkStringCache benchmarks lookups in a cache built by newStringCacheFactory.
func BenchmarkStringCache(b *testing.B) {
benchmarkStringCache(b, newStringCacheFactory)
}

// BenchmarkWildcardCache benchmarks lookups in a cache built by newWildcardCacheFactory.
func BenchmarkWildcardCache(b *testing.B) {
benchmarkWildcardCache(b, newWildcardCacheFactory)
}

// benchmarkStringCache runs benchmarkCache for newFactory on stringTestData.
func benchmarkStringCache(b *testing.B, newFactory func() cacheFactory) {
benchmarkCache(b, stringTestData, newFactory)
}

// benchmarkWildcardCache runs benchmarkCache for newFactory on wildcardTestData.
func benchmarkWildcardCache(b *testing.B, newFactory func() cacheFactory) {
benchmarkCache(b, wildcardTestData, newFactory)
}

// benchmarkCache measures lookup speed and retained heap of a cache.
//
// The cache is built once from data using a factory obtained from
// newFactory; that setup is excluded from timing via b.ResetTimer. Each
// benchmark iteration then looks up every element of data and fails the
// benchmark if one is reported missing. After the timer is stopped, the
// heap usage is reported under the "cache" prefix.
func benchmarkCache(b *testing.B, data []string, newFactory func() cacheFactory) {
	// baseline for the heap delta computed by reportMemUsage
	baseMemStats = readMemStats()

	builder := newFactory()
	for idx := range data {
		builder.addEntry(data[idx])
	}

	cache := builder.create()

	b.ReportAllocs()
	b.ResetTimer()

	for remaining := b.N; remaining > 0; remaining-- {
		for _, entry := range data {
			if cache.contains(entry) {
				continue
			}

			b.Fatalf("cache is missing value that was previously inserted: %s", entry)
		}
	}

	b.StopTimer()
	reportMemUsage(b, "cache", cache)
}

// ---

// readMemStats returns a snapshot of the runtime memory statistics taken
// right after running a garbage collection and asking the runtime to return
// freed memory to the OS.
func readMemStats() (res runtime.MemStats) {
	runtime.GC()
	debug.FreeOSMemory()

	snapshot := new(runtime.MemStats)
	runtime.ReadMemStats(snapshot)

	return *snapshot
}

func randString(n int) string {
const charPool = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-."
// toMeasure: keep these objects allocated during measuring
func reportMemUsage(b *testing.B, prefix string, toMeasure ...any) {
m := readMemStats()

b := make([]byte, n)
b.ReportMetric(toMB(m.HeapAlloc-baseMemStats.HeapAlloc), prefix+"_heap_MB")

for i := range b {
b[i] = charPool[rand.Intn(len(charPool))]
// Forces Go to keep `toMeasure` and its contents allocated
// You can tell it works because factory benchmarks have different values for both calls
for i := range toMeasure {
toMeasure[i] = nil
}
}

func toMB(b uint64) float64 {
const bytesInKB = float64(1024)

kb := float64(b) / bytesInKB

return string(b)
return math.Round(kb) / 1024
}

func createTestdata(count int) []string {
var result []string
func loadTestdata(path string) (res []string) {
f, err := os.Open(path)
if err != nil {
panic(err)
}
defer f.Close()

p := parsers.Hosts(f)

err = parsers.ForEach[*parsers.HostsIterator](context.Background(), p, func(hosts *parsers.HostsIterator) error {
return hosts.ForEach(func(host string) error {
res = append(res, host)

for i := 0; i < count; i++ {
result = append(result, randString(8+rand.Intn(20)))
return nil
})
})
if err != nil {
panic(err)
}

return result
return res
}
44 changes: 44 additions & 0 deletions cache/stringcache/string_caches_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,4 +73,48 @@ var _ = Describe("Caches", func() {
})
})
})

// Specs for the cache built by newWildcardCacheFactory: entries of the form
// "*.domain" must match the domain itself and all of its subdomains.
Describe("Wildcard StringCache", func() {
	BeforeEach(func() {
		factory = newWildcardCacheFactory()

		factory.addEntry("*.example.com")
		factory.addEntry("*.example.org")
		factory.addEntry("*.blocked")
		// Redundant on purpose: lookups are already covered by "*.blocked",
		// but the entry is still counted (see the element count spec).
		factory.addEntry("*.sub.blocked")

		cache = factory.create()
	})

	// The description previously said "regex" (copied from the regex cache
	// specs); this spec exercises wildcard entries.
	It("should match if one wildcard in StringCache matches string", func() {
		// first entry
		Expect(cache.contains("example.com")).Should(BeTrue())
		Expect(cache.contains("www.example.com")).Should(BeTrue())

		// look alikes
		Expect(cache.contains("com")).Should(BeFalse())
		Expect(cache.contains("example.coma")).Should(BeFalse())
		Expect(cache.contains("an-example.com")).Should(BeFalse())
		Expect(cache.contains("examplecom")).Should(BeFalse())

		// other entry
		Expect(cache.contains("example.org")).Should(BeTrue())
		Expect(cache.contains("www.example.org")).Should(BeTrue())

		// unrelated
		Expect(cache.contains("example.net")).Should(BeFalse())
		Expect(cache.contains("www.example.net")).Should(BeFalse())

		// third entry
		Expect(cache.contains("blocked")).Should(BeTrue())
		Expect(cache.contains("sub.blocked")).Should(BeTrue())
		Expect(cache.contains("sub.sub.blocked")).Should(BeTrue())
		Expect(cache.contains("example.blocked")).Should(BeTrue())
	})

	It("should return correct element count", func() {
		// all four added entries are counted, including the redundant one
		Expect(factory.count()).Should(Equal(4))
		Expect(cache.elementCount()).Should(Equal(4))
	})
})
})
Loading

0 comments on commit a4e993e

Please sign in to comment.