diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..2b96286 --- /dev/null +++ b/.github/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at info@pycm.ir. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 0000000..929da11 --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,16 @@ +# Contribution + +Changes and improvements are more than welcome! ❤️ Feel free to fork and open a pull request. + + +Please consider the following : + + +1. Fork it! +2. Add your functions/methods to proper files +3. Add standard `docstring` to your functions/methods +4. Add tests for your functions/methods (`doctest` testcases in `Test` folder) +5. Pass all CI tests +6. Update `CHANGELOG.md` + - Describe changes under `[Unreleased]` section +7. Submit a pull request (please complete the pull request template) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md new file mode 100644 index 0000000..18991f1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE.md @@ -0,0 +1,12 @@ +#### Description + +#### Steps/Code to Reproduce + +#### Expected Behavior + +#### Actual Behavior + +#### Operating System + +#### Compiler Version + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..ded12d6 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,6 @@ +#### Reference Issues/PRs + +#### What does this implement/fix? Explain your changes. + +#### Any other comments? 
+ diff --git a/AUTHORS.md b/AUTHORS.md new file mode 100644 index 0000000..823beee --- /dev/null +++ b/AUTHORS.md @@ -0,0 +1,4 @@ +# Authors # + +---------- +- Martin Khannouz - Concordia University ([m_khanno@encs.concordia.ca](mailto:m_khanno@encs.concordia.ca)) diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..5c63822 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,14 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] +### Added +- Reservoir Sampling +- Chained Reservoir Sampling +- Micro-Cluster Nearest Neighbour (MC-NN) +- Lightweight Temporal Compression (LTC) +- Bloom Filter +- Cuckoo Filter diff --git a/Makefile b/Makefile index 07e13d3..81c13d0 100644 --- a/Makefile +++ b/Makefile @@ -8,26 +8,40 @@ CPPOBJECT=$(TEST_DIR)/test_bloom.oo\ $(TEST_DIR)/test_chained_reservoir.oo\ $(TEST_DIR)/test_ltc.oo\ $(TEST_DIR)/test_mc_nn.oo -FLAGS=-g -FLAGS_PERF=-O3 + +FLAG_GCOV=-fprofile-arcs -ftest-coverage + +ifeq ($(config), debug) +CFLAGS=-DDEBUG -g -O0 $(FLAG_GCOV) +else #release config by default +CFLAGS=-Os -O3 +endif all: $(OBJECT) main.cpp - g++ -I$(SRC_DIR) -std=c++11 main.cpp $(OBJECT) $(FLAGS) -o $(EXE) + g++ -I$(SRC_DIR) -std=c++11 main.cpp $(OBJECT) $(CFLAGS) -o $(EXE) test: $(CPPOBJECT) $(TEST_DIR)/test.cpp - g++ -I$(SRC_DIR) -std=c++11 -fpermissive $(TEST_DIR)/test.cpp $(CPPOBJECT) $(FLAGS) -o $(EXE)-test -lgtest -lpthread + g++ -I$(SRC_DIR) -std=c++11 -fpermissive $(TEST_DIR)/test.cpp $(CPPOBJECT) $(CFLAGS) -o $(EXE)-test -lgtest -lpthread -lgcov perf: $(OBJECT) - g++ -I$(SRC_DIR) -std=c++11 main-performance.cpp $(OBJECT) $(FLAGS_PERF) -o $(EXE)-perf + g++ -I$(SRC_DIR) -std=c++11 main-performance.cpp $(OBJECT) $(CFLAGS) -o $(EXE)-perf run_test: test ./$(EXE)-test +coverage: run_test + mkdir coverage + gcov test 
+ lcov -c --directory . --output-file coverage.info --no-external + genhtml coverage.info --output-directory coverage + %.o: %.c gcc -std=c99 $< -c -o $@ %.oo: %.cpp - g++ -I$(SRC_DIR) -std=c++11 $(FLAGS) $< -c -o $@ -fpermissive + g++ -I$(SRC_DIR) -std=c++11 $(CFLAGS) $< -c -o $@ -fpermissive clean: rm -f *.o *.oo $(TEST_DIR)/*.oo $(SRC_DIR)/*.oo $(EXE) $(EXE)-test $(EXE)-perf + rm -rf coverage + rm -f test.gcda test.gcno $(TEST_DIR)/*.gcda $(TEST_DIR)/*.gcno coverage.info diff --git a/README.md b/README.md index 8548ee2..1c4c32e 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,21 @@ -[![Build Status](https://travis-ci.org/azazel7/OrpailleCC.svg?branch=master)](https://travis-ci.org/azazel7/OrpailleCC) +
+
+

OrpailleCC

+ + +
OrpailleCC is data stream library written in C++. It provides a consistent
-collection of data stream algorithms for embedded devices such as sensors.
+collection of data stream algorithms for embedded devices. The goal of
+OrpailleCC is to support research on data stream mining for connected objects,
+by facilitating the comparison and benchmarking of algorithms in a consistent
+framework. It also enables programmers of embedded systems to use
+out-of-the-box algorithms with an efficient implementation.
+Algorithms from OrpailleCC are based on C++ templates and do not use the STL library.
-The library is based on C++ templates and does not use the STL library. To start
-using a feature, just include the header files in your project and compile your
-project.
-
-# Get started
-## Hello World
-Let us run a basic example with a reservoir sampling [4] of size 3.
+## Get started
+### Hello World
+Let us run a basic example with a reservoir sampling \[4] of size 3.
Save the following code in *testy.cpp*.
```cpp
#include //Included for cout
@@ -38,27 +44,82 @@
$ g++ -I./src -std=c++11 testy.cpp -o testy
$ ./testy
Hll
```
-## Use the library in your project
-Simply pick the code you need and add to your project. You also need to add
-the C++11 (`-std=c++11`) flag to your compilation toolchain.
+### Install
+#### Requirement
+As the collection is designed to run on embedded systems without operating
+systems, OrpailleCC has very few dependencies and requirements.
+
+- Git : to download the repository.
+- C++ compiler with C++11: to compile OrpailleCC files.
+- googletest: to run unit tests.
+- Linux Operating System: because all instructions are given for Linux systems. However, OrpailleCC should compile properly on a Windows system as long as a C++ compiler is available.
+
+#### Installation
+To install OrpailleCC, first clone the repository. 
+```bash
+git clone https://github.com/big-data-lab-team/OrpailleCC.git
+```
+In this example, we assume that OrpailleCC is located in
+`/usr/include/OrpailleCC`. Change it according to your system.
+```bash
+ORPAILLECC_DIR=/usr/include/OrpailleCC
+```
+
+To use OrpailleCC in your project, add `ORPAILLECC_DIR/src` in the include directories of the project.
+Let's assume the project is the hello world example, located in *~/hello/hello.cpp*.
+
+```cpp
+#include <iostream> //Included for cout
+#include <reservoir_sampling.hpp>
+
+double randy(void){ //We need this function to provide a random number generator to ReservoirSampling.
+	return (double)rand() / (double)RAND_MAX; //On systems without rand, the programmer will have to define a pseudo-random function.
+}
+
+int main(){
+	char hello[] = "Hello-world!"; //Create a stream
+	ReservoirSampling<char, 3, randy> rs; //Instantiate a ReservoirSampling instance
+	//This instance works with char, contains a reservoir of size 3 and uses the randy function to generate random numbers.
+	for(int j = 0; j < 12; ++j) //Feed the ReservoirSampling instance with every element of the stream (here letters of the string)
+		rs.add(hello[j]);
+	for(int j = 0; j < 3; ++j) //Print every element in the reservoir
+		std::cout << rs[j];
+	std::cout << std::endl;
+	return 0;
+}
+```

-An alternative is to add `/src` to the include paths of your compiler.
+To compile this code (that uses the ReservoirSampling object), you need to run the following commands.
+
+```bash
+cd ~/hello
+g++ -std=c++11 -I$ORPAILLECC_DIR/src hello.cpp
+```
-## Test
-### Unit Test
+### Test
+#### Unit Test
The unit tests require the `googletest` library
([Here](https://github.com/google/googletest)).
To run the unit tests, run the command `make run_test`.
-### Performance
+#### Performance
To run a performance test on your device, compile the performance tests with
`make perf` then run `./main-perf`. 
![Alt](/figures/performance.png "An example of the performance output") -# Examples +#### Coverage +To observe the coverage of test function, run the following commands: +```bash +make clean +make config=debug coverage +``` +These commands will clean previous object files to rebuild them with the debug options, then run the test and gather the data for the coverage. +To visualize the test coverage, simply open *coverage/index.html* into your favorite browser. + +## Examples This section provides the list of all algorithms implemented in OrpailleCC with a brief example. -## Lightweight Temporal Compression (LTC) -LTC [0] is a compression algorithm that approximates a series of values with a linear +### Lightweight Temporal Compression (LTC) +LTC \[0] is a compression algorithm that approximates a series of values with a linear function. The epsilon parameter controls the amount of compression. If the linear approximation isn't accurate enough, then a new point is issued. @@ -82,8 +143,8 @@ int main(){ } } ``` -## Micro-Cluster Nearest Neighbour (MC-NN) -MC-NN [3] is a classifier based on k-nearest neighbours. It aggregates the data +### Micro-Cluster Nearest Neighbour (MC-NN) +MC-NN \[3] is a classifier based on k-nearest neighbours. It aggregates the data points into micro-clusters and make them evolve to catch concept drifts. ```cpp @@ -125,9 +186,9 @@ int main(){ return 0; } ``` -## Reservoir Sampling +### Reservoir Sampling The next example is the one used as a hello world example. A Reservoir -Sample [4] is a fixed-sized sample of the stream where all elements have +Sample \[4] is a fixed-sized sample of the stream where all elements have equal probability to appear. ```cpp #include //Included for cout @@ -149,8 +210,8 @@ int main(){ return 0; } ``` -## Chained Reservoir Sampling -The chained reservoir sampling [1] is a variant of the reservoir sampling that allows discarding outdated data while maintaining the reservoir distribution. 
+### Chained Reservoir Sampling +The chained reservoir sampling \[1] is a variant of the reservoir sampling that allows discarding outdated data while maintaining the reservoir distribution. ```cpp #include //Included for cout @@ -189,8 +250,8 @@ int main(){ std::cout << std::endl; } ``` -## Bloom Filter -The Bloom filter [5] excludes elements from the stream when they don't belong to +### Bloom Filter +The Bloom filter \[5] excludes elements from the stream when they don't belong to a pre-defined set. ```cpp #include //Included for cout @@ -224,8 +285,8 @@ int main(){ ``` Note that, due to the Bloom Filter size, more than three elements will be recognized by the filter. -## Cuckoo Filter -The Cuckoo filter [2] is used when elements have to be removed from the pre-defined +### Cuckoo Filter +The Cuckoo filter \[2] is used when elements have to be removed from the pre-defined set of accepted elements. ```cpp #include //Included for cout @@ -270,17 +331,17 @@ int main() { } ``` -# How can I help? +## How can I help? - Report issues and seek support in the Issues tab. - Write new examples or improve existing examples and share them with a pull request. - Submit ideas for future algorithms to integrate. - Submit pull requests with algorithm implementation. - Submit pull requests with additional test cases. 
-# References -- [0] Schoellhammer, Tom and Greenstein, Ben and Osterweil, Eric and Wimbrow, Michael and Estrin, Deborah (2004), "Lightweight temporal compression of microclimate datasets" -- [1] Babcock, Brian and Datar, Mayur and Motwani, Rajeev (2002), "Sampling from a moving window over streaming data", Proceedings of the thirteenth annual Association for Computing Machinery-SIAM symposium on Discrete algorithms, pages 633--634 -- [2] Fan, Bin and Andersen, Dave G and Kaminsky, Michael and Mitzenmacher, Michael (2014), "Cuckoo filter: Practically better than bloom", Proceedings of the 10th Association for Computing Machinery International on Conference on emerging Networking Experiments and Technologies, pages 75--88 -- [3] Tennant, Mark and Stahl, Frederic and Rana, Omer and Gomes, Joao Bartolo (2017), "Scalable real-time classification of data streams with concept drift", Future Generation Computer Systems, pages 187--199 -- [4] Vitter, Jeffrey S (1985), "Random sampling with a reservoir", Association for Computing Machinery Transactions on Mathematical Software (TOMS), pages 37--57 -- [5] Burton H. 
Bloom (1970), "Space/Time Trade-offs in Hash Coding with Allowable Errors", Communications of the Association for Computing Machinery +## References +- \[0] Schoellhammer, Tom and Greenstein, Ben and Osterweil, Eric and Wimbrow, Michael and Estrin, Deborah (2004), "Lightweight temporal compression of microclimate datasets" +- \[1] Babcock, Brian and Datar, Mayur and Motwani, Rajeev (2002), "Sampling from a moving window over streaming data", Proceedings of the thirteenth annual Association for Computing Machinery-SIAM symposium on Discrete algorithms, pages 633--634 +- \[2] Fan, Bin and Andersen, Dave G and Kaminsky, Michael and Mitzenmacher, Michael (2014), "Cuckoo filter: Practically better than bloom", Proceedings of the 10th Association for Computing Machinery International on Conference on emerging Networking Experiments and Technologies, pages 75--88 +- \[3] Tennant, Mark and Stahl, Frederic and Rana, Omer and Gomes, Joao Bartolo (2017), "Scalable real-time classification of data streams with concept drift", Future Generation Computer Systems, pages 187--199 +- \[4] Vitter, Jeffrey S (1985), "Random sampling with a reservoir", Association for Computing Machinery Transactions on Mathematical Software (TOMS), pages 37--57 +- \[5] Burton H. Bloom (1970), "Space/Time Trade-offs in Hash Coding with Allowable Errors", Communications of the Association for Computing Machinery diff --git a/src/bloom_filter.hpp b/src/bloom_filter.hpp index c066a9b..081726c 100644 --- a/src/bloom_filter.hpp +++ b/src/bloom_filter.hpp @@ -91,12 +91,20 @@ class BloomFilter{ } private: + /* + * Set a specific bit to 1 in the bit array. + * @param index the index of the bit to set to one. + */ void set_bit_to_one(int const index){ assert(index >= 0 && index < bit_size); unsigned int const mod = index % BYTE_SIZE; unsigned int const real_index = (index - mod) / BYTE_SIZE; bits[real_index] = bits[real_index] | (1 << mod); } + /* + * Access the value of a bit in the bit array. 
+ * @param index the index of the bit to access. + */ int get_bit(int const index) const{ assert(index >= 0 && index < bit_size); unsigned int const mod = index % BYTE_SIZE; diff --git a/src/chained_reservoir.hpp b/src/chained_reservoir.hpp index 0226f0b..590882a 100644 --- a/src/chained_reservoir.hpp +++ b/src/chained_reservoir.hpp @@ -9,14 +9,21 @@ */ template class ChainedReservoirSampling{ + /* + * Internal structure for the node of the linked list. + */ struct node { + /*A pointer to the next element in the list*/ node* next; + /*The element store*/ element_type element; + /*The timestamp of the element so we can discard it when it gets obsolete.*/ unsigned int timestamp; }; node sample[sample_size]; //`next` is not inside the node structure because once next is used for a node, it is not used again unsigned int next[sample_size]; + //The number of element seen so for by the reservoir. unsigned int counter = 0; /** @@ -69,7 +76,12 @@ class ChainedReservoirSampling{ assert(false); //TODO maybe clear the chain if it is corrupted } - void shift_chain(node& head, node& current){ + /* + * Shift the node from current to head. + * @param head the to start replacing from. + * @param current the node that should replace head. + */ + void shift_chain(node& head, node& current){//TODO improve this to avoid overwriting elements node& receiver = head; node& mover = current; while(1){//TODO add security @@ -99,7 +111,7 @@ class ChainedReservoirSampling{ current.timestamp = 0; } for(int i = 0; i < sample_size; ++i) - next[i] = 4294967294; //`next` cannot be negative so to avoid 0 to be assign to all sample, I set it to the maximum + next[i] = 4294967294; //`next` cannot be negative so to avoid 0 to be assign to all sample, it is set to the maximum for an int } /** * Sample one new element into the sample. This new element may not be added. 
@@ -147,12 +159,9 @@ class ChainedReservoirSampling{ return sample[i].element; } /* - * Declare a timestamp and all anterior timestamp obsolete + * Declare a timestamp and all anterior timestamp obsolete. All element with obsolete timestamp will be discarded. * @param timestamp the timestamp to declare obsolete */ - /** Set a new obsolete timestamp. All element with a timestamp prior to this timestamp will be discarded. - * @param timestamp a new obsolete timestamp. - */ void obsolete(unsigned int const timestamp){ for(int i = 0; i < sample_size; ++i){ node& head = sample[i]; diff --git a/src/cuckoo_filter.hpp b/src/cuckoo_filter.hpp index 12a1243..d4f3338 100644 --- a/src/cuckoo_filter.hpp +++ b/src/cuckoo_filter.hpp @@ -27,11 +27,20 @@ class CuckooFilter{ static unsigned int const total_size = ceil((double)bucket_count*bucket_size*entry_size / (double)BYTE_SIZE); unsigned char filter[total_size] = {0}; + /* + * Access a bit in the filter. + * @param bit_index the index of the bit to access in the entire filter. + */ unsigned char get_bit(unsigned int const bit_index) const{ unsigned int const mod = bit_index % BYTE_SIZE; unsigned int const byte_index = (bit_index - mod) / BYTE_SIZE; return (filter[byte_index] & (1 << mod) != 0); } + /* + * Set a bit in the filter. + * @param bit_index the index of the bit to set in the entire filter. + * @param value the new value of the bit (0 or 1). + */ void set_bit(unsigned int const bit_index, unsigned int const value){ unsigned int const mod = bit_index % BYTE_SIZE; unsigned int const byte_index = (bit_index - mod) / BYTE_SIZE; @@ -40,6 +49,11 @@ class CuckooFilter{ else filter[byte_index] = filter[byte_index] | (1 << mod); } + /* + * Access a fingerprint in the filter. + * @param bucket_index the index of the bucket. + * @param entry_index the index of the fingerprint in that bucket. 
+ */ fingerprint_t get_entry(unsigned int const bucket_index, unsigned int const entry_index) const{ assert(bucket_index >= 0 && bucket_index < bucket_count); assert(bucket_size >= 0 && entry_index < bucket_size); @@ -67,6 +81,12 @@ class CuckooFilter{ } return tmp; } + /* + * Set a fingerprint in the filter. + * @param bucket_index the index of the bucket. + * @param entry_index the index of the fingerprint in that bucket. + * @param fp the new value of the finger print. + */ void set_entry(unsigned int const bucket_index, unsigned int const entry_index, fingerprint_t const fp){ assert(bucket_index >= 0 && bucket_index < bucket_count); assert(bucket_size >= 0 && entry_index < bucket_size); @@ -85,15 +105,26 @@ class CuckooFilter{ set_bit(bi, value); } } + /* + * Compute the number of empty entry in a bucket. + * @param bucket_index the index of the bucket. + */ int space_in_bucket(unsigned int const bucket_index){ - fingerprint_t tmp; for(int i = 0; i < bucket_size; ++i){ + fingerprint_t tmp; tmp = get_entry(bucket_index, i); if(tmp == empty_value) return i; } return -1; //return index available, -1 otherwise } + /* + * Search an element in the filter. + * @param e the element to search for. + * @param bucket_index the bucket_index that contain the element (output). + * @param entry_index the index of the fingerprint in the bucket (output). + * @return true if the element is found. + */ bool search(element_type const* e, unsigned int& bucket_index, unsigned int& entry_index) const{ fingerprint_t fp = funct::fingerprint(e); unsigned int h[2]; diff --git a/src/ltc.hpp b/src/ltc.hpp index 868214d..8022fe4 100644 --- a/src/ltc.hpp +++ b/src/ltc.hpp @@ -9,6 +9,9 @@ */ template class LTC{ + /* + * Internal structure to describe a data point. 
+ */ struct data_point{ timestamp_type timestamp; element_type value; @@ -17,28 +20,45 @@ class LTC{ int counter = 0; //Counter make sure the algorithm does not fail for the first 3 values + /* + * Return the minimum between a and b. + */ template static K min(K const a, K const b){ if(a > b) return b; return a; } + /* + * Return the maximum between a and b. + */ template static K max(K const a, K const b){ if(a < b) return b; return a; } + /* + * Set the upper and lower limit. + */ void set_ul_and_ll(void){ UL = new_ul; LL = new_ll; } + /* + * Update the upper and lower limit based on a new data point. + * @param timestamp the timestamp of the data point. + * @param value The value of the data point. + */ void update(timestamp_type const timestamp, element_type const value){ new_ul.timestamp = timestamp; new_ul.value = min(value + epsilon, max_value); new_ll.timestamp = timestamp; new_ll.value = max(value - epsilon, min_value); } + /* + * Compute the need to transmit the compress data point. + */ bool need_transmit(void){ double old_up_deriva = (double)(UL.value - last_transmit_point.value) / (UL.timestamp - last_transmit_point.timestamp) / time_unit_difference; double old_low_deriva = (double)(LL.value - last_transmit_point.value) / (LL.timestamp - last_transmit_point.timestamp) / time_unit_difference; @@ -58,6 +78,9 @@ class LTC{ return false; } public: + /* + * Default constructor. + */ LTC(){ } /** diff --git a/src/mc_nn.hpp b/src/mc_nn.hpp index 436d2c9..0c9d2e4 100644 --- a/src/mc_nn.hpp +++ b/src/mc_nn.hpp @@ -24,12 +24,21 @@ class MCNN{ int label; int error_count; double initial_timestamp; + /* + * Compute the variance for one feature. + * @param features_idx the index of the feature. 
+ */ double variance(unsigned int const features_idx) const{ double const a = (double)features_square_sum[features_idx] / (double)data_count; //CF2X double const b = (double)features_sum[features_idx] / (double)data_count; //CF1X double const ret=a - b*b; return ret; } + /* + * Incorporate a data point into the micro-cluster. + * @param feature the data point. + * @param timestamp the timestamp at which the data point has been added. + */ void incorporate(feature_type const* features, double const timestamp){ timestamp_sum += timestamp; timestamp_square_sum += timestamp * timestamp; @@ -39,13 +48,23 @@ class MCNN{ features_square_sum[i] += features[i]*features[i]; } } + /* + * Compute the performance (or the participation) of the micro-cluster at a specific timestamp. + * @param timestamp the current timestamp. + */ double performance(double const current_timestamp) const{ double const current_tn = triangular_number(current_timestamp); double const initial_tn = triangular_number(initial_timestamp); - double const real_tn = current_timestamp - initial_timestamp; + double const real_tn = current_tn - initial_tn; double const participation = timestamp_sum * (100 / real_tn); return participation; } + /* + * Initialize a cluster. + * @param features the first data point added to the cluster. + * @param label the label of the cluster. (note, the label won't change for this cluster.) + * @param timestamp the timestamp of the cluster. + */ void initialize(feature_type const* features, int const label, double const timestamp){ initial_timestamp = timestamp; timestamp_sum = timestamp; @@ -58,11 +77,18 @@ class MCNN{ features_square_sum[i] = features[i]*features[i]; } } + /* + * Compute the data point corresponding to the center of the micro-cluster. + * @param features the data point of the cluster (output). 
+ */ void centroid(feature_type* features) const{ for(int i = 0; i < feature_count; ++i){ features[i] = (double)features_sum[i] / (double)data_count; } } + /* + * Overload of the = operator. + */ cluster& operator=(const cluster& other){ if(this != &other){ for(int i = 0; i < feature_count; ++i){ @@ -94,7 +120,10 @@ class MCNN{ return ((t*t + t) * 0.5); } - //Function to split a cluster + /* + * Function to split a cluster + * @param cluster_idx the index of the cluster to split. + */ void split(int const cluster_idx){ int new_idx = -1; for(int i = 0; i < max_cluster; ++i){ @@ -105,6 +134,7 @@ class MCNN{ } if(new_idx < 0){ //TODO what to do when there is no more space :] + //Remove the least performant cluster. } //Choose the attribute with the greatest variance, then do the split on it @@ -136,7 +166,11 @@ class MCNN{ active[new_idx] = true; count_active_cluster += 1; } - //Compute the squared distance between two data point + /* + * Compute the squared distance between two data point. + * @param e1 data point 1. + * @param e2 data point 2. + */ double euclidean_distance(feature_type const* e1, feature_type const* e2) const{ double squared_sum = 0; for(int i = 0; i < feature_count; ++i) @@ -144,7 +178,13 @@ class MCNN{ //NOTE: Not really a distance :D return squared_sum; } - //Find the nearest cluster as well as the nearest cluster with the same class given a data point + /* + * Find the nearest cluster as well as the nearest cluster with the same class given a data point. + * @param features the data point features. + * @param label the label (or class) of the data point. + * @param nearest the nearest cluster of the data point (output). + * @param nearest_with_class the nearest cluster of the data point that has the same label as the data point (output). 
+ */ void find_nearest_clusters(feature_type const* features, int const label, int& nearest, int& nearest_with_class) const{ //First find the nearest cluster double distance_nearest = -1; @@ -176,7 +216,12 @@ class MCNN{ if(shortest_distance == distance_nearest) nearest = nearest_with_class; } - //Find the nearest cluster given a data point + /* + * Find the nearest cluster given a data point. + * @param features the data point features. + * @param nearest the nearest cluster of the data point (output). + * @param shortest if not null, contains the squared distance between the data point and the nearest cluster. + */ void find_nearest_clusters(feature_type const* features, int& nearest, double* shortest = nullptr) const{ feature_type centroid[feature_count]; int nearest_cluster = -1; @@ -200,7 +245,10 @@ class MCNN{ if(shortest != nullptr) *shortest = shortest_distance; } - //The sqrt implementation if needed. + /* + * The sqrt implementation if needed. + * @param val the number to compute the square root. 
+ */ double sqrt_local(double const val) const{ return sqrt(val); //TODO: to change } diff --git a/src/reservoir_sampling.hpp b/src/reservoir_sampling.hpp index 8b39f30..1a4a5f7 100644 --- a/src/reservoir_sampling.hpp +++ b/src/reservoir_sampling.hpp @@ -9,6 +9,7 @@ template class ReservoirSampling{ element_type sample[sample_size]; + //Count the number of element seen so far unsigned int counter = 0; /** diff --git a/tests/test_mc_nn.cpp b/tests/test_mc_nn.cpp index d0ecfa7..b8c4f78 100644 --- a/tests/test_mc_nn.cpp +++ b/tests/test_mc_nn.cpp @@ -11,6 +11,7 @@ TEST(MCNN, one) { classifier.train(dataset, label); auto predict = classifier.predict(dataset); EXPECT_EQ (label, predict); + free(dataset); } TEST(MCNN, insert_once) { /* @@ -32,6 +33,7 @@ TEST(MCNN, insert_once) { auto predict = classifier.predict(dataset + (i*4)); EXPECT_EQ (labels[i], predict); } + free(dataset); } TEST(MCNN, same_class) { MCNN mcnn(); @@ -49,6 +51,7 @@ TEST(MCNN, same_class) { auto predict = classifier.predict(dataset + (i*4)); EXPECT_EQ (label, predict); } + free(dataset); } TEST(MCNN, split) { int dataset_size = 13;