forked from tylertreat/BoomFilters
-
Notifications
You must be signed in to change notification settings - Fork 0
/
topk.go
125 lines (104 loc) · 2.83 KB
/
topk.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package boom
import (
"bytes"
"container/heap"
)
type Element struct {
Data []byte
Freq uint64
}
// An elementHeap is a min-heap of elements.
type elementHeap []*Element
func (e elementHeap) Len() int { return len(e) }
func (e elementHeap) Less(i, j int) bool { return e[i].Freq < e[j].Freq }
func (e elementHeap) Swap(i, j int) { e[i], e[j] = e[j], e[i] }
func (e *elementHeap) Push(x interface{}) {
*e = append(*e, x.(*Element))
}
func (e *elementHeap) Pop() interface{} {
old := *e
n := len(old)
x := old[n-1]
*e = old[0 : n-1]
return x
}
// TopK uses a Count-Min Sketch to calculate the top-K frequent elements in a
// stream.
type TopK struct {
cms *CountMinSketch
k uint
n uint
elements *elementHeap
}
// NewTopK creates a new TopK backed by a Count-Min sketch whose relative
// accuracy is within a factor of epsilon with probability delta. It tracks the
// k-most frequent elements.
func NewTopK(epsilon, delta float64, k uint) *TopK {
elements := make(elementHeap, 0, k)
heap.Init(&elements)
return &TopK{
cms: NewCountMinSketch(epsilon, delta),
k: k,
elements: &elements,
}
}
// Add will add the data to the Count-Min Sketch and update the top-k heap if
// applicable. Returns the TopK to allow for chaining.
func (t *TopK) Add(data []byte) *TopK {
t.cms.Add(data)
t.n++
freq := t.cms.Count(data)
if t.isTop(freq) {
t.insert(data, freq)
}
return t
}
// Elements returns the top-k elements from lowest to highest frequency.
func (t *TopK) Elements() []*Element {
if t.elements.Len() == 0 {
return make([]*Element, 0)
}
elements := make(elementHeap, t.elements.Len())
copy(elements, *t.elements)
heap.Init(&elements)
topK := make([]*Element, 0, t.k)
for elements.Len() > 0 {
topK = append(topK, heap.Pop(&elements).(*Element))
}
return topK
}
// Reset restores the TopK to its original state. It returns itself to allow
// for chaining.
func (t *TopK) Reset() *TopK {
t.cms.Reset()
elements := make(elementHeap, 0, t.k)
heap.Init(&elements)
t.elements = &elements
t.n = 0
return t
}
// isTop indicates if the given frequency falls within the top-k heap.
func (t *TopK) isTop(freq uint64) bool {
if t.elements.Len() < int(t.k) {
return true
}
return freq >= (*t.elements)[0].Freq
}
// insert adds the data to the top-k heap. If the data is already an element,
// the frequency is updated. If the heap already has k elements, the element
// with the minimum frequency is removed.
func (t *TopK) insert(data []byte, freq uint64) {
for _, element := range *t.elements {
if bytes.Compare(data, element.Data) == 0 {
// Element already in top-k.
element.Freq = freq
return
}
}
if t.elements.Len() == int(t.k) {
// Remove minimum-frequency element.
heap.Pop(t.elements)
}
// Add element to top-k.
heap.Push(t.elements, &Element{Data: data, Freq: freq})
}