diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000..2b96286
--- /dev/null
+++ b/.github/CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+ advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+ address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at info@pycm.ir. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 0000000..929da11
--- /dev/null
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,16 @@
+# Contribution
+
+Changes and improvements are more than welcome! ❤️ Feel free to fork and open a pull request.
+
+
+Please consider the following:
+
+
+1. Fork it!
+2. Add your functions/methods to proper files
+3. Add standard `docstring` to your functions/methods
+4. Add tests for your functions/methods (`doctest` testcases in `Test` folder)
+5. Pass all CI tests
+6. Update `CHANGELOG.md`
+ - Describe changes under `[Unreleased]` section
+7. Submit a pull request (please complete the pull request template)
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
new file mode 100644
index 0000000..18991f1
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE.md
@@ -0,0 +1,12 @@
+#### Description
+
+#### Steps/Code to Reproduce
+
+#### Expected Behavior
+
+#### Actual Behavior
+
+#### Operating System
+
+#### Compiler Version
+
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000..ded12d6
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,6 @@
+#### Reference Issues/PRs
+
+#### What does this implement/fix? Explain your changes.
+
+#### Any other comments?
+
diff --git a/AUTHORS.md b/AUTHORS.md
new file mode 100644
index 0000000..823beee
--- /dev/null
+++ b/AUTHORS.md
@@ -0,0 +1,4 @@
+# Authors #
+
+----------
+- Martin Khannouz - Concordia University ([m_khanno@encs.concordia.ca](mailto:m_khanno@encs.concordia.ca))
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..5c63822
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,14 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+### Added
+- Reservoir Sampling
+- Chained Reservoir Sampling
+- Micro-Cluster Nearest Neighbour (MC-NN)
+- Lightweight Temporal Compression (LTC)
+- Bloom Filter
+- Cuckoo Filter
diff --git a/Makefile b/Makefile
index 07e13d3..81c13d0 100644
--- a/Makefile
+++ b/Makefile
@@ -8,26 +8,40 @@ CPPOBJECT=$(TEST_DIR)/test_bloom.oo\
$(TEST_DIR)/test_chained_reservoir.oo\
$(TEST_DIR)/test_ltc.oo\
$(TEST_DIR)/test_mc_nn.oo
-FLAGS=-g
-FLAGS_PERF=-O3
+
+FLAG_GCOV=-fprofile-arcs -ftest-coverage
+
+ifeq ($(config), debug)
+CFLAGS=-DDEBUG -g -O0 $(FLAG_GCOV)
+else #release config by default
+CFLAGS=-Os -O3
+endif
all: $(OBJECT) main.cpp
- g++ -I$(SRC_DIR) -std=c++11 main.cpp $(OBJECT) $(FLAGS) -o $(EXE)
+ g++ -I$(SRC_DIR) -std=c++11 main.cpp $(OBJECT) $(CFLAGS) -o $(EXE)
test: $(CPPOBJECT) $(TEST_DIR)/test.cpp
- g++ -I$(SRC_DIR) -std=c++11 -fpermissive $(TEST_DIR)/test.cpp $(CPPOBJECT) $(FLAGS) -o $(EXE)-test -lgtest -lpthread
+ g++ -I$(SRC_DIR) -std=c++11 -fpermissive $(TEST_DIR)/test.cpp $(CPPOBJECT) $(CFLAGS) -o $(EXE)-test -lgtest -lpthread -lgcov
perf: $(OBJECT)
- g++ -I$(SRC_DIR) -std=c++11 main-performance.cpp $(OBJECT) $(FLAGS_PERF) -o $(EXE)-perf
+ g++ -I$(SRC_DIR) -std=c++11 main-performance.cpp $(OBJECT) $(CFLAGS) -o $(EXE)-perf
run_test: test
./$(EXE)-test
+coverage: run_test
+ mkdir coverage
+ gcov test
+ lcov -c --directory . --output-file coverage.info --no-external
+ genhtml coverage.info --output-directory coverage
+
%.o: %.c
gcc -std=c99 $< -c -o $@
%.oo: %.cpp
- g++ -I$(SRC_DIR) -std=c++11 $(FLAGS) $< -c -o $@ -fpermissive
+ g++ -I$(SRC_DIR) -std=c++11 $(CFLAGS) $< -c -o $@ -fpermissive
clean:
rm -f *.o *.oo $(TEST_DIR)/*.oo $(SRC_DIR)/*.oo $(EXE) $(EXE)-test $(EXE)-perf
+ rm -rf coverage
+ rm -f test.gcda test.gcno $(TEST_DIR)/*.gcda $(TEST_DIR)/*.gcno coverage.info
diff --git a/README.md b/README.md
index 8548ee2..1c4c32e 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,21 @@
-[![Build Status](https://travis-ci.org/azazel7/OrpailleCC.svg?branch=master)](https://travis-ci.org/azazel7/OrpailleCC)
+
+
+
OrpailleCC
+
+
+
OrpailleCC is data stream library written in C++. It provides a consistent
-collection of data stream algorithms for embedded devices such as sensors.
+collection of data stream algorithms for embedded devices. The goal of
+OrpailleCC is to support research on data stream mining for connected objects,
+by facilitating the comparison and benchmarking of algorithms in a consistent
+framework. It also enables programmers of embedded systems to use
+out-of-the-box algorithms with an efficient implementation.
+Algorithms from OrpailleCC are based on C++ templates and do not use the STL library.
-The library is based on C++ templates and does not use the STL library. To start
-using a feature, just include the header files in your project and compile your
-project.
-
-# Get started
-## Hello World
-Let us run a basic example with a reservoir sampling [4] of size 3.
+## Get started
+### Hello World
+Let us run a basic example with a reservoir sampling \[4] of size 3.
Save the following code in *testy.cpp*.
```cpp
#include //Included for cout
@@ -38,27 +44,82 @@ $ g++ -I./src -std=c++11 testy.cpp -o testy
$ ./testy
Hll
```
-## Use the library in your project
-Simply pick the code you need and add to your project. You also need to add
-the C++11 (`-std=c++11`) flag to your compilation toolchain.
+### Install
+#### Requirement
+As the collection is designed to run on embedded systems without an operating
+system, OrpailleCC has very few dependencies and requirements.
+
+- Git: to download the repository.
+- C++ compiler with C++11: to compile OrpailleCC files.
+- googletest: to run unit tests.
+- Linux Operating System: because all instructions are given for Linux systems. However, OrpailleCC should compile properly on a Windows system as long as a C++ compiler is available.
+
+#### Installation
+To install OrpailleCC, first clone the repository.
+```bash
+git clone https://github.com/big-data-lab-team/OrpailleCC.git
+```
+In this example, we assume that OrpailleCC is located in
+`/usr/include/OrpailleCC`. Adjust it according to your system.
+```bash
+ORPAILLECC_DIR=/usr/include/OrpailleCC
+```
+
+To use OrpailleCC in your project add `ORPAILLECC_DIR/src` in the include directories of the project.
+Let's assume the project is the hello world example, located in *~/hello/hello.cpp*.
+
+```cpp
+#include //Included for cout
+#include
+
+double randy(void){ //We need this function to provide a random number generator to ReservoirSampling.
+ return (double)rand() / (double)RAND_MAX; //On systems without rand, the programmer will have to define a pseudo-random function.
+}
+
+int main(){
+ char hello[] = "Hello-world!"; //Create a stream
+ ReservoirSampling rs; //Instantiate a ReservoirSampling instance
+ //This instance works with char, contains a reservoir of size 3 and use the randy function to generate random numbers.
+ for(int j = 0; j < 12; ++j) //Feed the ReservoirSampling instance with every element of the stream (here letters of the string)
+ rs.add(hello[j]);
+ for(int j = 0; j < 3; ++j) //Print every element in the reservoir
+ std::cout << rs[j];
+ std::cout << std::endl;
+ return 0;
+}
+```
-An alternative is to add `/src` to the include paths of your compiler.
+To compile this code (that uses the ReservoirSampling object), you need to run the following commands.
+
+```bash
+cd ~/hello
+g++ -std=c++11 -I$ORPAILLECC_DIR/src hello.cpp
+```
-## Test
-### Unit Test
+### Test
+#### Unit Test
The unit tests require the `googletest` library ([Here](https://github.com/google/googletest)).
To run the unit tests, run the command `make run_test`.
-### Performance
+#### Performance
To run a performance test on your device, compile the performance tests with
`make perf` then run `./main-perf`.
![Alt](/figures/performance.png "An example of the performance output")
-# Examples
+#### Coverage
+To observe the test coverage, run the following commands:
+```bash
+make clean
+make config=debug coverage
+```
+These commands will clean previous object files to rebuild them with the debug options, then run the test and gather the data for the coverage.
+To visualize the test coverage, simply open *coverage/index.html* into your favorite browser.
+
+## Examples
This section provides the list of all algorithms implemented in OrpailleCC with a brief example.
-## Lightweight Temporal Compression (LTC)
-LTC [0] is a compression algorithm that approximates a series of values with a linear
+### Lightweight Temporal Compression (LTC)
+LTC \[0] is a compression algorithm that approximates a series of values with a linear
function. The epsilon parameter controls the amount of compression. If the
linear approximation isn't accurate enough, then a new point is
issued.
@@ -82,8 +143,8 @@ int main(){
}
}
```
-## Micro-Cluster Nearest Neighbour (MC-NN)
-MC-NN [3] is a classifier based on k-nearest neighbours. It aggregates the data
+### Micro-Cluster Nearest Neighbour (MC-NN)
+MC-NN \[3] is a classifier based on k-nearest neighbours. It aggregates the data
points into micro-clusters and make them evolve to catch concept drifts.
```cpp
@@ -125,9 +186,9 @@ int main(){
return 0;
}
```
-## Reservoir Sampling
+### Reservoir Sampling
The next example is the one used as a hello world example. A Reservoir
-Sample [4] is a fixed-sized sample of the stream where all elements have
+Sample \[4] is a fixed-sized sample of the stream where all elements have
equal probability to appear.
```cpp
#include //Included for cout
@@ -149,8 +210,8 @@ int main(){
return 0;
}
```
-## Chained Reservoir Sampling
-The chained reservoir sampling [1] is a variant of the reservoir sampling that allows discarding outdated data while maintaining the reservoir distribution.
+### Chained Reservoir Sampling
+The chained reservoir sampling \[1] is a variant of the reservoir sampling that allows discarding outdated data while maintaining the reservoir distribution.
```cpp
#include //Included for cout
@@ -189,8 +250,8 @@ int main(){
std::cout << std::endl;
}
```
-## Bloom Filter
-The Bloom filter [5] excludes elements from the stream when they don't belong to
+### Bloom Filter
+The Bloom filter \[5] excludes elements from the stream when they don't belong to
a pre-defined set.
```cpp
#include //Included for cout
@@ -224,8 +285,8 @@ int main(){
```
Note that, due to the Bloom Filter size, more than three elements will be recognized by the filter.
-## Cuckoo Filter
-The Cuckoo filter [2] is used when elements have to be removed from the pre-defined
+### Cuckoo Filter
+The Cuckoo filter \[2] is used when elements have to be removed from the pre-defined
set of accepted elements.
```cpp
#include //Included for cout
@@ -270,17 +331,17 @@ int main() {
}
```
-# How can I help?
+## How can I help?
- Report issues and seek support in the Issues tab.
- Write new examples or improve existing examples and share them with a pull request.
- Submit ideas for future algorithms to integrate.
- Submit pull requests with algorithm implementation.
- Submit pull requests with additional test cases.
-# References
-- [0] Schoellhammer, Tom and Greenstein, Ben and Osterweil, Eric and Wimbrow, Michael and Estrin, Deborah (2004), "Lightweight temporal compression of microclimate datasets"
-- [1] Babcock, Brian and Datar, Mayur and Motwani, Rajeev (2002), "Sampling from a moving window over streaming data", Proceedings of the thirteenth annual Association for Computing Machinery-SIAM symposium on Discrete algorithms, pages 633--634
-- [2] Fan, Bin and Andersen, Dave G and Kaminsky, Michael and Mitzenmacher, Michael (2014), "Cuckoo filter: Practically better than bloom", Proceedings of the 10th Association for Computing Machinery International on Conference on emerging Networking Experiments and Technologies, pages 75--88
-- [3] Tennant, Mark and Stahl, Frederic and Rana, Omer and Gomes, Joao Bartolo (2017), "Scalable real-time classification of data streams with concept drift", Future Generation Computer Systems, pages 187--199
-- [4] Vitter, Jeffrey S (1985), "Random sampling with a reservoir", Association for Computing Machinery Transactions on Mathematical Software (TOMS), pages 37--57
-- [5] Burton H. Bloom (1970), "Space/Time Trade-offs in Hash Coding with Allowable Errors", Communications of the Association for Computing Machinery
+## References
+- \[0] Schoellhammer, Tom and Greenstein, Ben and Osterweil, Eric and Wimbrow, Michael and Estrin, Deborah (2004), "Lightweight temporal compression of microclimate datasets"
+- \[1] Babcock, Brian and Datar, Mayur and Motwani, Rajeev (2002), "Sampling from a moving window over streaming data", Proceedings of the thirteenth annual Association for Computing Machinery-SIAM symposium on Discrete algorithms, pages 633--634
+- \[2] Fan, Bin and Andersen, Dave G and Kaminsky, Michael and Mitzenmacher, Michael (2014), "Cuckoo filter: Practically better than bloom", Proceedings of the 10th Association for Computing Machinery International on Conference on emerging Networking Experiments and Technologies, pages 75--88
+- \[3] Tennant, Mark and Stahl, Frederic and Rana, Omer and Gomes, Joao Bartolo (2017), "Scalable real-time classification of data streams with concept drift", Future Generation Computer Systems, pages 187--199
+- \[4] Vitter, Jeffrey S (1985), "Random sampling with a reservoir", Association for Computing Machinery Transactions on Mathematical Software (TOMS), pages 37--57
+- \[5] Burton H. Bloom (1970), "Space/Time Trade-offs in Hash Coding with Allowable Errors", Communications of the Association for Computing Machinery
diff --git a/src/bloom_filter.hpp b/src/bloom_filter.hpp
index c066a9b..081726c 100644
--- a/src/bloom_filter.hpp
+++ b/src/bloom_filter.hpp
@@ -91,12 +91,20 @@ class BloomFilter{
}
private:
+ /*
+ * Set a specific bit to 1 in the bit array.
+ * @param index the index of the bit to set to one.
+ */
void set_bit_to_one(int const index){
assert(index >= 0 && index < bit_size);
unsigned int const mod = index % BYTE_SIZE;
unsigned int const real_index = (index - mod) / BYTE_SIZE;
bits[real_index] = bits[real_index] | (1 << mod);
}
+ /*
+ * Access the value of a bit in the bit array.
+ * @param index the index of the bit to access.
+ */
int get_bit(int const index) const{
assert(index >= 0 && index < bit_size);
unsigned int const mod = index % BYTE_SIZE;
diff --git a/src/chained_reservoir.hpp b/src/chained_reservoir.hpp
index 0226f0b..590882a 100644
--- a/src/chained_reservoir.hpp
+++ b/src/chained_reservoir.hpp
@@ -9,14 +9,21 @@
*/
template
class ChainedReservoirSampling{
+ /*
+ * Internal structure for the node of the linked list.
+ */
struct node {
+ /*A pointer to the next element in the list*/
node* next;
+		/*The element stored*/
element_type element;
+ /*The timestamp of the element so we can discard it when it gets obsolete.*/
unsigned int timestamp;
};
node sample[sample_size];
//`next` is not inside the node structure because once next is used for a node, it is not used again
unsigned int next[sample_size];
+	//The number of elements seen so far by the reservoir.
unsigned int counter = 0;
/**
@@ -69,7 +76,12 @@ class ChainedReservoirSampling{
assert(false);
//TODO maybe clear the chain if it is corrupted
}
- void shift_chain(node& head, node& current){
+ /*
+ * Shift the node from current to head.
+	 * @param head the node to start replacing from.
+ * @param current the node that should replace head.
+ */
+ void shift_chain(node& head, node& current){//TODO improve this to avoid overwriting elements
node& receiver = head;
node& mover = current;
while(1){//TODO add security
@@ -99,7 +111,7 @@ class ChainedReservoirSampling{
current.timestamp = 0;
}
for(int i = 0; i < sample_size; ++i)
- next[i] = 4294967294; //`next` cannot be negative so to avoid 0 to be assign to all sample, I set it to the maximum
+ next[i] = 4294967294; //`next` cannot be negative so to avoid 0 to be assign to all sample, it is set to the maximum for an int
}
/**
* Sample one new element into the sample. This new element may not be added.
@@ -147,12 +159,9 @@ class ChainedReservoirSampling{
return sample[i].element;
}
/*
- * Declare a timestamp and all anterior timestamp obsolete
+	 * Declare a timestamp and all anterior timestamps obsolete. All elements with an obsolete timestamp will be discarded.
* @param timestamp the timestamp to declare obsolete
*/
- /** Set a new obsolete timestamp. All element with a timestamp prior to this timestamp will be discarded.
- * @param timestamp a new obsolete timestamp.
- */
void obsolete(unsigned int const timestamp){
for(int i = 0; i < sample_size; ++i){
node& head = sample[i];
diff --git a/src/cuckoo_filter.hpp b/src/cuckoo_filter.hpp
index 12a1243..d4f3338 100644
--- a/src/cuckoo_filter.hpp
+++ b/src/cuckoo_filter.hpp
@@ -27,11 +27,20 @@ class CuckooFilter{
static unsigned int const total_size = ceil((double)bucket_count*bucket_size*entry_size / (double)BYTE_SIZE);
unsigned char filter[total_size] = {0};
+ /*
+ * Access a bit in the filter.
+ * @param bit_index the index of the bit to access in the entire filter.
+ */
unsigned char get_bit(unsigned int const bit_index) const{
unsigned int const mod = bit_index % BYTE_SIZE;
unsigned int const byte_index = (bit_index - mod) / BYTE_SIZE;
return (filter[byte_index] & (1 << mod) != 0);
}
+ /*
+ * Set a bit in the filter.
+ * @param bit_index the index of the bit to set in the entire filter.
+ * @param value the new value of the bit (0 or 1).
+ */
void set_bit(unsigned int const bit_index, unsigned int const value){
unsigned int const mod = bit_index % BYTE_SIZE;
unsigned int const byte_index = (bit_index - mod) / BYTE_SIZE;
@@ -40,6 +49,11 @@ class CuckooFilter{
else
filter[byte_index] = filter[byte_index] | (1 << mod);
}
+ /*
+ * Access a fingerprint in the filter.
+ * @param bucket_index the index of the bucket.
+ * @param entry_index the index of the fingerprint in that bucket.
+ */
fingerprint_t get_entry(unsigned int const bucket_index, unsigned int const entry_index) const{
assert(bucket_index >= 0 && bucket_index < bucket_count);
assert(bucket_size >= 0 && entry_index < bucket_size);
@@ -67,6 +81,12 @@ class CuckooFilter{
}
return tmp;
}
+ /*
+ * Set a fingerprint in the filter.
+ * @param bucket_index the index of the bucket.
+ * @param entry_index the index of the fingerprint in that bucket.
+	 * @param fp the new value of the fingerprint.
+ */
void set_entry(unsigned int const bucket_index, unsigned int const entry_index, fingerprint_t const fp){
assert(bucket_index >= 0 && bucket_index < bucket_count);
assert(bucket_size >= 0 && entry_index < bucket_size);
@@ -85,15 +105,26 @@ class CuckooFilter{
set_bit(bi, value);
}
}
+ /*
+	 * Find the index of the first empty entry in a bucket (-1 if the bucket is full).
+ * @param bucket_index the index of the bucket.
+ */
int space_in_bucket(unsigned int const bucket_index){
- fingerprint_t tmp;
for(int i = 0; i < bucket_size; ++i){
+ fingerprint_t tmp;
tmp = get_entry(bucket_index, i);
if(tmp == empty_value)
return i;
}
return -1; //return index available, -1 otherwise
}
+ /*
+ * Search an element in the filter.
+ * @param e the element to search for.
+ * @param bucket_index the bucket_index that contain the element (output).
+ * @param entry_index the index of the fingerprint in the bucket (output).
+ * @return true if the element is found.
+ */
bool search(element_type const* e, unsigned int& bucket_index, unsigned int& entry_index) const{
fingerprint_t fp = funct::fingerprint(e);
unsigned int h[2];
diff --git a/src/ltc.hpp b/src/ltc.hpp
index 868214d..8022fe4 100644
--- a/src/ltc.hpp
+++ b/src/ltc.hpp
@@ -9,6 +9,9 @@
*/
template
class LTC{
+ /*
+ * Internal structure to describe a data point.
+ */
struct data_point{
timestamp_type timestamp;
element_type value;
@@ -17,28 +20,45 @@ class LTC{
int counter = 0; //Counter make sure the algorithm does not fail for the first 3 values
+ /*
+ * Return the minimum between a and b.
+ */
template
static K min(K const a, K const b){
if(a > b)
return b;
return a;
}
+ /*
+ * Return the maximum between a and b.
+ */
template
static K max(K const a, K const b){
if(a < b)
return b;
return a;
}
+ /*
+ * Set the upper and lower limit.
+ */
void set_ul_and_ll(void){
UL = new_ul;
LL = new_ll;
}
+ /*
+ * Update the upper and lower limit based on a new data point.
+ * @param timestamp the timestamp of the data point.
+ * @param value The value of the data point.
+ */
void update(timestamp_type const timestamp, element_type const value){
new_ul.timestamp = timestamp;
new_ul.value = min(value + epsilon, max_value);
new_ll.timestamp = timestamp;
new_ll.value = max(value - epsilon, min_value);
}
+ /*
+	 * Determine whether the compressed data point needs to be transmitted.
+ */
bool need_transmit(void){
double old_up_deriva = (double)(UL.value - last_transmit_point.value) / (UL.timestamp - last_transmit_point.timestamp) / time_unit_difference;
double old_low_deriva = (double)(LL.value - last_transmit_point.value) / (LL.timestamp - last_transmit_point.timestamp) / time_unit_difference;
@@ -58,6 +78,9 @@ class LTC{
return false;
}
public:
+ /*
+ * Default constructor.
+ */
LTC(){
}
/**
diff --git a/src/mc_nn.hpp b/src/mc_nn.hpp
index 436d2c9..0c9d2e4 100644
--- a/src/mc_nn.hpp
+++ b/src/mc_nn.hpp
@@ -24,12 +24,21 @@ class MCNN{
int label;
int error_count;
double initial_timestamp;
+ /*
+ * Compute the variance for one feature.
+ * @param features_idx the index of the feature.
+ */
double variance(unsigned int const features_idx) const{
double const a = (double)features_square_sum[features_idx] / (double)data_count; //CF2X
double const b = (double)features_sum[features_idx] / (double)data_count; //CF1X
double const ret=a - b*b;
return ret;
}
+ /*
+ * Incorporate a data point into the micro-cluster.
+	 * @param features the data point.
+ * @param timestamp the timestamp at which the data point has been added.
+ */
void incorporate(feature_type const* features, double const timestamp){
timestamp_sum += timestamp;
timestamp_square_sum += timestamp * timestamp;
@@ -39,13 +48,23 @@ class MCNN{
features_square_sum[i] += features[i]*features[i];
}
}
+ /*
+ * Compute the performance (or the participation) of the micro-cluster at a specific timestamp.
+	 * @param current_timestamp the current timestamp.
+ */
double performance(double const current_timestamp) const{
double const current_tn = triangular_number(current_timestamp);
double const initial_tn = triangular_number(initial_timestamp);
- double const real_tn = current_timestamp - initial_timestamp;
+ double const real_tn = current_tn - initial_tn;
double const participation = timestamp_sum * (100 / real_tn);
return participation;
}
+ /*
+ * Initialize a cluster.
+ * @param features the first data point added to the cluster.
+ * @param label the label of the cluster. (note, the label won't change for this cluster.)
+ * @param timestamp the timestamp of the cluster.
+ */
void initialize(feature_type const* features, int const label, double const timestamp){
initial_timestamp = timestamp;
timestamp_sum = timestamp;
@@ -58,11 +77,18 @@ class MCNN{
features_square_sum[i] = features[i]*features[i];
}
}
+ /*
+ * Compute the data point corresponding to the center of the micro-cluster.
+ * @param features the data point of the cluster (output).
+ */
void centroid(feature_type* features) const{
for(int i = 0; i < feature_count; ++i){
features[i] = (double)features_sum[i] / (double)data_count;
}
}
+ /*
+ * Overload of the = operator.
+ */
cluster& operator=(const cluster& other){
if(this != &other){
for(int i = 0; i < feature_count; ++i){
@@ -94,7 +120,10 @@ class MCNN{
return ((t*t + t) * 0.5);
}
- //Function to split a cluster
+ /*
+ * Function to split a cluster
+ * @param cluster_idx the index of the cluster to split.
+ */
void split(int const cluster_idx){
int new_idx = -1;
for(int i = 0; i < max_cluster; ++i){
@@ -105,6 +134,7 @@ class MCNN{
}
if(new_idx < 0){
//TODO what to do when there is no more space :]
+ //Remove the least performant cluster.
}
//Choose the attribute with the greatest variance, then do the split on it
@@ -136,7 +166,11 @@ class MCNN{
active[new_idx] = true;
count_active_cluster += 1;
}
- //Compute the squared distance between two data point
+ /*
+ * Compute the squared distance between two data point.
+ * @param e1 data point 1.
+ * @param e2 data point 2.
+ */
double euclidean_distance(feature_type const* e1, feature_type const* e2) const{
double squared_sum = 0;
for(int i = 0; i < feature_count; ++i)
@@ -144,7 +178,13 @@ class MCNN{
//NOTE: Not really a distance :D
return squared_sum;
}
- //Find the nearest cluster as well as the nearest cluster with the same class given a data point
+ /*
+ * Find the nearest cluster as well as the nearest cluster with the same class given a data point.
+ * @param features the data point features.
+ * @param label the label (or class) of the data point.
+ * @param nearest the nearest cluster of the data point (output).
+ * @param nearest_with_class the nearest cluster of the data point that has the same label as the data point (output).
+ */
void find_nearest_clusters(feature_type const* features, int const label, int& nearest, int& nearest_with_class) const{
//First find the nearest cluster
double distance_nearest = -1;
@@ -176,7 +216,12 @@ class MCNN{
if(shortest_distance == distance_nearest)
nearest = nearest_with_class;
}
- //Find the nearest cluster given a data point
+ /*
+ * Find the nearest cluster given a data point.
+ * @param features the data point features.
+ * @param nearest the nearest cluster of the data point (output).
+ * @param shortest if not null, contains the squared distance between the data point and the nearest cluster.
+ */
void find_nearest_clusters(feature_type const* features, int& nearest, double* shortest = nullptr) const{
feature_type centroid[feature_count];
int nearest_cluster = -1;
@@ -200,7 +245,10 @@ class MCNN{
if(shortest != nullptr)
*shortest = shortest_distance;
}
- //The sqrt implementation if needed.
+ /*
+ * The sqrt implementation if needed.
+ * @param val the number to compute the square root.
+ */
double sqrt_local(double const val) const{
return sqrt(val); //TODO: to change
}
diff --git a/src/reservoir_sampling.hpp b/src/reservoir_sampling.hpp
index 8b39f30..1a4a5f7 100644
--- a/src/reservoir_sampling.hpp
+++ b/src/reservoir_sampling.hpp
@@ -9,6 +9,7 @@
template
class ReservoirSampling{
element_type sample[sample_size];
+	//Count the number of elements seen so far
unsigned int counter = 0;
/**
diff --git a/tests/test_mc_nn.cpp b/tests/test_mc_nn.cpp
index d0ecfa7..b8c4f78 100644
--- a/tests/test_mc_nn.cpp
+++ b/tests/test_mc_nn.cpp
@@ -11,6 +11,7 @@ TEST(MCNN, one) {
classifier.train(dataset, label);
auto predict = classifier.predict(dataset);
EXPECT_EQ (label, predict);
+ free(dataset);
}
TEST(MCNN, insert_once) {
/*
@@ -32,6 +33,7 @@ TEST(MCNN, insert_once) {
auto predict = classifier.predict(dataset + (i*4));
EXPECT_EQ (labels[i], predict);
}
+ free(dataset);
}
TEST(MCNN, same_class) {
MCNN mcnn();
@@ -49,6 +51,7 @@ TEST(MCNN, same_class) {
auto predict = classifier.predict(dataset + (i*4));
EXPECT_EQ (label, predict);
}
+ free(dataset);
}
TEST(MCNN, split) {
int dataset_size = 13;