From 9624612a59bfda40a3f0d466a895868da953a84a Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 6 Dec 2023 20:09:37 +0100 Subject: [PATCH 01/21] Change kmer_t into template --- CMakeLists.txt | 8 -- include/bit_vector_iterator.hpp | 1 + include/buckets.hpp | 15 +-- include/builder/{build.cpp => build.impl} | 6 +- include/builder/build_index.hpp | 3 +- include/builder/build_skew_index.hpp | 5 +- include/builder/parse_file.hpp | 25 ++--- include/builder/util.hpp | 3 +- include/constants.hpp | 4 +- include/dictionary.hpp | 24 ++++- include/{dictionary.cpp => dictionary.impl} | 91 +++++++++++-------- include/{dump.cpp => dump.impl} | 7 +- include/hash_util.hpp | 17 ++-- include/{info.cpp => info.impl} | 29 +----- include/kmer.hpp | 3 +- include/minimizer_enumerator.hpp | 9 +- include/query/streaming_query.hpp | 39 ++++---- .../streaming_query_canonical_parsing.hpp | 25 ++--- .../query/streaming_query_regular_parsing.hpp | 25 ++--- include/skew_index.hpp | 25 ++++- include/{statistics.cpp => statistics.impl} | 6 +- include/util.hpp | 11 ++- src/bench_utils.hpp | 9 +- src/check_utils.hpp | 33 ++++--- src/common.hpp | 3 +- src/sshash.cpp | 10 +- test/test_alphabet.cpp | 22 +++-- 27 files changed, 258 insertions(+), 200 deletions(-) rename include/builder/{build.cpp => build.impl} (95%) rename include/{dictionary.cpp => dictionary.impl} (66%) rename include/{dump.cpp => dump.impl} (94%) rename include/{info.cpp => info.impl} (70%) rename include/{statistics.cpp => statistics.impl} (93%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e92a0d..81c5e78 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,18 +46,10 @@ set(Z_LIB_SOURCES include/gz/zip_stream.cpp ) -set(SSHASH_SOURCES - include/dictionary.cpp - include/info.cpp - include/dump.cpp - include/statistics.cpp - include/builder/build.cpp -) # Create a static lib add_library(sshash_static STATIC ${Z_LIB_SOURCES} - ${SSHASH_SOURCES} ) add_executable(sshash src/sshash.cpp) diff --git a/include/bit_vector_iterator.hpp b/include/bit_vector_iterator.hpp index 3ed5aad..0393cbe 100644 --- a/include/bit_vector_iterator.hpp +++ b/include/bit_vector_iterator.hpp @@ -5,6 +5,7 @@ namespace sshash { +template struct bit_vector_iterator { bit_vector_iterator() : m_bv(nullptr) {} diff --git a/include/buckets.hpp b/include/buckets.hpp index 7da1478..f076ec1 100644 --- a/include/buckets.hpp +++ b/include/buckets.hpp @@ -6,6 +6,7 @@ namespace sshash { +template struct buckets { std::pair offset_to_id(uint64_t offset, uint64_t k) const { auto [pos, contig_begin, contig_end] = pieces.locate(offset); @@ -52,13 +53,13 @@ struct buckets { kmer_t contig_prefix(uint64_t contig_id, uint64_t k) const { uint64_t contig_begin = pieces.access(contig_id); - bit_vector_iterator bv_it(strings, 2 * contig_begin); + bit_vector_iterator bv_it(strings, 2 * contig_begin); return bv_it.read(2 * (k - 1)); } kmer_t contig_suffix(uint64_t contig_id, uint64_t k) const { uint64_t contig_end = pieces.access(contig_id + 1); - bit_vector_iterator bv_it(strings, 2 * (contig_end - k + 1)); + bit_vector_iterator bv_it(strings, 2 * (contig_end - k + 1)); return bv_it.read(2 * (k - 1)); } @@ -90,7 +91,7 @@ struct buckets { uint64_t m) const { uint64_t offset = offsets.access(super_kmer_id); auto [res, contig_end] = offset_to_id(offset, k); - bit_vector_iterator bv_it(strings, 2 * offset); + bit_vector_iterator bv_it(strings, 2 * offset); uint64_t window_size = std::min(k - m + 1, contig_end - offset - k + 1); for (uint64_t w = 0; w != window_size; ++w) { kmer_t read_kmer = bv_it.read_and_advance_by_two(2 * k); @@ -115,7 +116,7 @@ struct buckets { for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) { uint64_t offset = offsets.access(super_kmer_id); auto [res, contig_end] = offset_to_id(offset, k); - bit_vector_iterator bv_it(strings, 2 * offset); + bit_vector_iterator bv_it(strings, 2 * offset); uint64_t window_size = std::min(k - m + 1, contig_end - offset - k + 1); for (uint64_t w = 0; w != window_size; ++w) { kmer_t read_kmer = bv_it.read_and_advance_by_two(2 * k); @@ -166,7 +167,7 @@ struct buckets { void access(uint64_t kmer_id, char* string_kmer, uint64_t k) const { uint64_t offset = id_to_offset(kmer_id, k); - bit_vector_iterator bv_it(strings, 2 * offset); + bit_vector_iterator bv_it(strings, 2 * offset); kmer_t read_kmer = bv_it.read(2 * k); util::uint_kmer_to_string(read_kmer, string_kmer, k); } @@ -176,7 +177,7 @@ struct buckets { iterator(buckets const* ptr, uint64_t kmer_id, uint64_t k, uint64_t num_kmers) : m_buckets(ptr), m_kmer_id(kmer_id), m_k(k), m_num_kmers(num_kmers) { - bv_it = bit_vector_iterator(m_buckets->strings, -1); + bv_it = bit_vector_iterator(m_buckets->strings, -1); offset = m_buckets->id_to_offset(m_kmer_id, k); auto [pos, piece_end] = m_buckets->pieces.next_geq(offset); if (piece_end == offset) pos += 1; @@ -219,7 +220,7 @@ struct buckets { uint64_t m_kmer_id, m_k, m_num_kmers; uint64_t offset; uint64_t next_offset; - bit_vector_iterator bv_it; + bit_vector_iterator bv_it; ef_sequence::iterator pieces_it; kmer_t read_kmer; diff --git a/include/builder/build.cpp b/include/builder/build.impl similarity index 95% rename from include/builder/build.cpp rename to include/builder/build.impl index e34992c..d4c5f63 100644 --- a/include/builder/build.cpp +++ b/include/builder/build.impl @@ -1,4 +1,3 @@ -#include "../dictionary.hpp" #include "../../external/pthash/external/essentials/include/essentials.hpp" #include "util.hpp" @@ -12,7 +11,8 @@ namespace sshash { -void dictionary::build(std::string const& filename, build_configuration const& build_config) { +template +void dictionary::build(std::string const& filename, build_configuration const& build_config) { /* Validate the build configuration. */ if (build_config.k == 0) throw std::runtime_error("k must be > 0"); if (build_config.k > constants::max_k) { @@ -41,7 +41,7 @@ void dictionary::build(std::string const& filename, build_configuration const& b /* step 1: parse the input file and build compact string pool ***/ timer.start(); - parse_data data = parse_file(filename, build_config); + parse_data data = parse_file(filename, build_config); m_size = data.num_kmers; timer.stop(); timings.push_back(timer.elapsed()); diff --git a/include/builder/build_index.hpp b/include/builder/build_index.hpp index 4878390..6c90c5f 100644 --- a/include/builder/build_index.hpp +++ b/include/builder/build_index.hpp @@ -155,7 +155,8 @@ struct bucket_pairs { } }; -buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, buckets& m_buckets, +template +buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, buckets& m_buckets, build_configuration const& build_config) { uint64_t num_buckets = m_minimizers.size(); uint64_t num_kmers = data.num_kmers; diff --git a/include/builder/build_skew_index.hpp b/include/builder/build_skew_index.hpp index febe43e..2bb06fd 100644 --- a/include/builder/build_skew_index.hpp +++ b/include/builder/build_skew_index.hpp @@ -4,7 +4,8 @@ namespace sshash { -void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& m_buckets, +template +void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& m_buckets, build_configuration const& build_config, buckets_statistics const& buckets_stats) { const uint64_t min_log2_size = m_skew_index.min_log2; @@ -202,7 +203,7 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& assert(lists[i].size() > lower and lists[i].size() <= upper); uint64_t super_kmer_id = 0; for (auto [offset, num_kmers_in_super_kmer] : lists[i]) { - bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); + bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); for (uint64_t i = 0; i != num_kmers_in_super_kmer; ++i) { kmer_t kmer = bv_it.read(2 * build_config.k); keys_in_partition.push_back(kmer); diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index 66a63fd..59eab93 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -4,15 +4,17 @@ namespace sshash { +template struct parse_data { parse_data(std::string const& tmp_dirname) : num_kmers(0), minimizers(tmp_dirname) {} uint64_t num_kmers; minimizers_tuples minimizers; - compact_string_pool strings; + compact_string_pool strings; weights::builder weights_builder; }; -void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) { +template +void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) { uint64_t k = build_config.k; uint64_t m = build_config.m; uint64_t seed = build_config.seed; @@ -29,7 +31,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b /* fit into the wanted number of bits */ assert(max_num_kmers_in_super_kmer < (1ULL << (sizeof(num_kmers_in_super_kmer_uint_type) * 8))); - compact_string_pool::builder builder(k); + typename compact_string_pool::builder builder(k); std::string sequence; uint64_t prev_minimizer = constants::invalid_uint64; @@ -161,12 +163,12 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b while (end != sequence.size() - k + 1) { char const* kmer = sequence.data() + end; assert(util::is_valid(kmer, k)); - kmer_t uint_kmer = util::string_to_uint_kmer(kmer, k); - uint64_t minimizer = util::compute_minimizer(uint_kmer, k, m, seed); + kmer_t uint_kmer = util::string_to_uint_kmer(kmer, k); + uint64_t minimizer = util::compute_minimizer(uint_kmer, k, m, seed); if (build_config.canonical_parsing) { - kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, k); - uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed); + kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, k); + uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed); minimizer = std::min(minimizer, minimizer_rc); } @@ -205,16 +207,17 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b } } -parse_data parse_file(std::string const& filename, build_configuration const& build_config) { +template +parse_data parse_file(std::string const& filename, build_configuration const& build_config) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); std::cout << "reading file '" << filename << "'..." << std::endl; - parse_data data(build_config.tmp_dirname); + parse_data data(build_config.tmp_dirname); if (util::ends_with(filename, ".gz")) { zip_istream zis(is); - parse_file(zis, data, build_config); + parse_file(zis, data, build_config); } else { - parse_file(is, data, build_config); + parse_file(is, data, build_config); } is.close(); return data; diff --git a/include/builder/util.hpp b/include/builder/util.hpp index 54527f9..aea32ca 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -26,6 +26,7 @@ struct parse_runtime_error : public std::runtime_error { } } +template struct compact_string_pool { compact_string_pool() {} @@ -49,7 +50,7 @@ struct compact_string_pool { pieces.push_back(bvb_strings.size() / 2); } for (uint64_t i = prefix; i != size; ++i) { - bvb_strings.append_bits(util::char_to_uint(string[i]), 2); + bvb_strings.append_bits(util::char_to_uint(string[i]), 2); } num_super_kmers += 1; offset = bvb_strings.size() / 2; diff --git a/include/constants.hpp b/include/constants.hpp index 946b0cf..f72b815 100644 --- a/include/constants.hpp +++ b/include/constants.hpp @@ -4,10 +4,10 @@ namespace sshash::constants { -constexpr uint64_t uint_kmer_bits = sizeof(kmer_t) * 8; +constexpr uint64_t uint_kmer_bits = sizeof(default_kmer_t) * 8; /* max *odd* size that can be packed into sizeof(kmer_t)*8 bits */ -constexpr uint64_t max_k = sizeof(kmer_t) * 4 - 1; +constexpr uint64_t max_k = sizeof(default_kmer_t) * 4 - 1; /* max *odd* size that can be packed into 64 bits */ constexpr uint64_t max_m = 31; diff --git a/include/dictionary.hpp b/include/dictionary.hpp index 0bf0b6e..f546819 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -8,6 +8,14 @@ namespace sshash { +// Forward declarations of the friend template classes +template +struct streaming_query_canonical_parsing; + +template +struct streaming_query_regular_parsing; + +template struct dictionary { dictionary() : m_size(0), m_seed(0), m_k(0), m_m(0), m_canonical_parsing(0) {} @@ -61,8 +69,8 @@ struct dictionary { bool is_member_uint(kmer_t uint_kmer, bool check_reverse_complement = true) const; /* Streaming queries. */ - friend struct streaming_query_canonical_parsing; - friend struct streaming_query_regular_parsing; + friend struct streaming_query_canonical_parsing; + friend struct streaming_query_regular_parsing; streaming_query_report streaming_query_from_file(std::string const& filename, bool multiline) const; @@ -75,7 +83,7 @@ struct dictionary { std::pair next() { return it.next(); } private: - typename buckets::iterator it; + typename buckets::iterator it; }; iterator begin() const { return iterator(this); } @@ -112,8 +120,8 @@ struct dictionary { uint16_t m_m; uint16_t m_canonical_parsing; minimizers m_minimizers; - buckets m_buckets; - skew_index m_skew_index; + buckets m_buckets; + skew_index m_skew_index; weights m_weights; lookup_result lookup_uint_regular_parsing(kmer_t uint_kmer) const; @@ -123,3 +131,9 @@ struct dictionary { }; } // namespace sshash + +#include "builder/build.impl" +#include "dictionary.impl" +#include "dump.impl" +#include "info.impl" +#include "statistics.impl" \ No newline at end of file diff --git a/include/dictionary.cpp b/include/dictionary.impl similarity index 66% rename from include/dictionary.cpp rename to include/dictionary.impl index ce5f209..b8e83cf 100644 --- a/include/dictionary.cpp +++ b/include/dictionary.impl @@ -1,8 +1,7 @@ -#include "dictionary.hpp" - namespace sshash { -lookup_result dictionary::lookup_uint_regular_parsing(kmer_t uint_kmer) const { +template +lookup_result dictionary::lookup_uint_regular_parsing(kmer_t uint_kmer) const { uint64_t minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); uint64_t bucket_id = m_minimizers.lookup(minimizer); @@ -23,7 +22,8 @@ lookup_result dictionary::lookup_uint_regular_parsing(kmer_t uint_kmer) const { return m_buckets.lookup(begin, end, uint_kmer, m_k, m_m); } -lookup_result dictionary::lookup_uint_canonical_parsing(kmer_t uint_kmer) const { +template +lookup_result dictionary::lookup_uint_canonical_parsing(kmer_t uint_kmer) const { kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, m_k); uint64_t minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, m_k, m_m, m_seed); @@ -55,21 +55,25 @@ lookup_result dictionary::lookup_uint_canonical_parsing(kmer_t uint_kmer) const return m_buckets.lookup_canonical(begin, end, uint_kmer, uint_kmer_rc, m_k, m_m); } -uint64_t dictionary::lookup(char const* string_kmer, bool check_reverse_complement) const { - kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); +template +uint64_t dictionary::lookup(char const* string_kmer, bool check_reverse_complement) const { + kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return lookup_uint(uint_kmer, check_reverse_complement); } -uint64_t dictionary::lookup_uint(kmer_t uint_kmer, bool check_reverse_complement) const { +template +uint64_t dictionary::lookup_uint(kmer_t uint_kmer, bool check_reverse_complement) const { auto res = lookup_advanced_uint(uint_kmer, check_reverse_complement); return res.kmer_id; } -lookup_result dictionary::lookup_advanced(char const* string_kmer, +template +lookup_result dictionary::lookup_advanced(char const* string_kmer, bool check_reverse_complement) const { - kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); + kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return lookup_advanced_uint(uint_kmer, check_reverse_complement); } -lookup_result dictionary::lookup_advanced_uint(kmer_t uint_kmer, +template +lookup_result dictionary::lookup_advanced_uint(kmer_t uint_kmer, bool check_reverse_complement) const { if (m_canonical_parsing) return lookup_uint_canonical_parsing(uint_kmer); auto res = lookup_uint_regular_parsing(uint_kmer); @@ -82,70 +86,83 @@ lookup_result dictionary::lookup_advanced_uint(kmer_t uint_kmer, return res; } -bool dictionary::is_member(char const* string_kmer, bool check_reverse_complement) const { +template +bool dictionary::is_member(char const* string_kmer, bool check_reverse_complement) const { return lookup(string_kmer, check_reverse_complement) != constants::invalid_uint64; } -bool dictionary::is_member_uint(kmer_t uint_kmer, bool check_reverse_complement) const { +template +bool dictionary::is_member_uint(kmer_t uint_kmer, bool check_reverse_complement) const { return lookup_uint(uint_kmer, check_reverse_complement) != constants::invalid_uint64; } -void dictionary::access(uint64_t kmer_id, char* string_kmer) const { +template +void dictionary::access(uint64_t kmer_id, char* string_kmer) const { assert(kmer_id < size()); m_buckets.access(kmer_id, string_kmer, m_k); } -uint64_t dictionary::weight(uint64_t kmer_id) const { +template +uint64_t dictionary::weight(uint64_t kmer_id) const { assert(kmer_id < size()); return m_weights.weight(kmer_id); } -uint64_t dictionary::contig_size(uint64_t contig_id) const { +template +uint64_t dictionary::contig_size(uint64_t contig_id) const { assert(contig_id < num_contigs()); uint64_t contig_length = m_buckets.contig_length(contig_id); assert(contig_length >= m_k); return contig_length - m_k + 1; } -void dictionary::forward_neighbours(kmer_t suffix, neighbourhood& res) const { - res.forward_A = lookup_advanced_uint(suffix + (util::char_to_uint('A') << (2 * (m_k - 1)))); - res.forward_C = lookup_advanced_uint(suffix + (util::char_to_uint('C') << (2 * (m_k - 1)))); - res.forward_G = lookup_advanced_uint(suffix + (util::char_to_uint('G') << (2 * (m_k - 1)))); - res.forward_T = lookup_advanced_uint(suffix + (util::char_to_uint('T') << (2 * (m_k - 1)))); +template +void dictionary::forward_neighbours(kmer_t suffix, neighbourhood& res) const { + res.forward_A = lookup_advanced_uint(suffix + (util::char_to_uint('A') << (2 * (m_k - 1)))); + res.forward_C = lookup_advanced_uint(suffix + (util::char_to_uint('C') << (2 * (m_k - 1)))); + res.forward_G = lookup_advanced_uint(suffix + (util::char_to_uint('G') << (2 * (m_k - 1)))); + res.forward_T = lookup_advanced_uint(suffix + (util::char_to_uint('T') << (2 * (m_k - 1)))); } -void dictionary::backward_neighbours(kmer_t prefix, neighbourhood& res) const { - res.backward_A = lookup_advanced_uint(prefix + util::char_to_uint('A')); - res.backward_C = lookup_advanced_uint(prefix + util::char_to_uint('C')); - res.backward_G = lookup_advanced_uint(prefix + util::char_to_uint('G')); - res.backward_T = lookup_advanced_uint(prefix + util::char_to_uint('T')); +template +void dictionary::backward_neighbours(kmer_t prefix, neighbourhood& res) const { + res.backward_A = lookup_advanced_uint(prefix + util::char_to_uint('A')); + res.backward_C = lookup_advanced_uint(prefix + util::char_to_uint('C')); + res.backward_G = lookup_advanced_uint(prefix + util::char_to_uint('G')); + res.backward_T = lookup_advanced_uint(prefix + util::char_to_uint('T')); } -neighbourhood dictionary::kmer_forward_neighbours(char const* string_kmer) const { - kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); +template +neighbourhood dictionary::kmer_forward_neighbours(char const* string_kmer) const { + kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return kmer_forward_neighbours(uint_kmer); } -neighbourhood dictionary::kmer_forward_neighbours(kmer_t uint_kmer) const { +template +neighbourhood dictionary::kmer_forward_neighbours(kmer_t uint_kmer) const { neighbourhood res; kmer_t suffix = uint_kmer >> 2; forward_neighbours(suffix, res); return res; } -neighbourhood dictionary::kmer_backward_neighbours(char const* string_kmer) const { - kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); +template +neighbourhood dictionary::kmer_backward_neighbours(char const* string_kmer) const { + kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return kmer_backward_neighbours(uint_kmer); } -neighbourhood dictionary::kmer_backward_neighbours(kmer_t uint_kmer) const { +template +neighbourhood dictionary::kmer_backward_neighbours(kmer_t uint_kmer) const { neighbourhood res; kmer_t prefix = (uint_kmer << 2) & ((kmer_t(1) << (2 * m_k)) - 1); backward_neighbours(prefix, res); return res; } -neighbourhood dictionary::kmer_neighbours(char const* string_kmer) const { - kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); +template +neighbourhood dictionary::kmer_neighbours(char const* string_kmer) const { + kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return kmer_neighbours(uint_kmer); } -neighbourhood dictionary::kmer_neighbours(kmer_t uint_kmer) const { +template +neighbourhood dictionary::kmer_neighbours(kmer_t uint_kmer) const { neighbourhood res; kmer_t suffix = uint_kmer >> 2; forward_neighbours(suffix, res); @@ -154,7 +171,8 @@ neighbourhood dictionary::kmer_neighbours(kmer_t uint_kmer) const { return res; } -neighbourhood dictionary::contig_neighbours(uint64_t contig_id) const { +template +neighbourhood dictionary::contig_neighbours(uint64_t contig_id) const { assert(contig_id < num_contigs()); neighbourhood res; kmer_t suffix = m_buckets.contig_suffix(contig_id, m_k); @@ -164,7 +182,8 @@ neighbourhood dictionary::contig_neighbours(uint64_t contig_id) const { return res; } -uint64_t dictionary::num_bits() const { +template +uint64_t dictionary::num_bits() const { return 8 * (sizeof(m_size) + sizeof(m_seed) + sizeof(m_k) + sizeof(m_m) + sizeof(m_canonical_parsing)) + m_minimizers.num_bits() + m_buckets.num_bits() + m_skew_index.num_bits() + diff --git a/include/dump.cpp b/include/dump.impl similarity index 94% rename from include/dump.cpp rename to include/dump.impl index 4ead9b7..8c7d005 100644 --- a/include/dump.cpp +++ b/include/dump.impl @@ -1,8 +1,7 @@ -#include "dictionary.hpp" - namespace sshash { -void dictionary::dump(std::string const& filename) const { +template +void dictionary::dump(std::string const& filename) const { uint64_t num_kmers = size(); uint64_t num_minimizers = m_minimizers.size(); uint64_t num_super_kmers = m_buckets.offsets.size(); @@ -24,7 +23,7 @@ void dictionary::dump(std::string const& filename) const { uint64_t offset = m_buckets.offsets.access(super_kmer_id); auto [_, contig_end] = m_buckets.offset_to_id(offset, m_k); (void)_; - bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); + bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); uint64_t window_size = std::min(m_k - m_m + 1, contig_end - offset - m_k + 1); uint64_t prev_minimizer = constants::invalid_uint64; bool super_kmer_header_written = false; diff --git a/include/hash_util.hpp b/include/hash_util.hpp index b1090da..5685104 100644 --- a/include/hash_util.hpp +++ b/include/hash_util.hpp @@ -5,6 +5,7 @@ namespace sshash { +template struct kmers_pthash_hasher_64 { typedef pthash::hash64 hash_type; @@ -24,6 +25,7 @@ struct kmers_pthash_hasher_64 { } }; +template struct kmers_pthash_hasher_128 { typedef pthash::hash128 hash_type; @@ -50,8 +52,8 @@ struct kmers_pthash_hasher_128 { // typedef pthash::murmurhash2_64 minimizers_base_hasher_type; typedef pthash::murmurhash2_128 minimizers_base_hasher_type; -// typedef kmers_pthash_hasher_64 kmers_base_hasher_type; -typedef kmers_pthash_hasher_128 kmers_base_hasher_type; +template +using kmers_base_hasher_type = kmers_pthash_hasher_128; typedef pthash::single_phf minimizers_pthash_type; -typedef pthash::single_phf - kmers_pthash_type; +template +using kmers_pthash_type = pthash::single_phf< + kmers_base_hasher_type, // base hasher + pthash::dictionary_dictionary, // encoder type + true>; // minimal output + /* used to hash m-mers and determine the minimizer of a k-mer */ struct murmurhash2_64 { diff --git a/include/info.cpp b/include/info.impl similarity index 70% rename from include/info.cpp rename to include/info.impl index 71de6d4..649097f 100644 --- a/include/info.cpp +++ b/include/info.impl @@ -1,29 +1,7 @@ -#include "dictionary.hpp" -#include "skew_index.hpp" - namespace sshash { -uint64_t skew_index::print_info() const { - uint64_t num_partitions = mphfs.size(); - uint64_t lower = 1ULL << min_log2; - uint64_t upper = 2 * lower; - uint64_t num_kmers_in_skew_index = 0; - for (uint64_t partition_id = 0; partition_id != num_partitions; ++partition_id) { - uint64_t n = mphfs[partition_id].num_keys(); - assert(n == positions[partition_id].size()); - std::cout << "num_kmers belonging to buckets of size > " << lower << " and <= " << upper - << ": " << n << "; "; - std::cout << "bits/kmer = " << static_cast(mphfs[partition_id].num_bits()) / n - << " (mphf) + " << (positions[partition_id].bytes() * 8.0) / n - << " (positions)\n"; - num_kmers_in_skew_index += n; - lower = upper; - upper = 2 * lower; - } - return num_kmers_in_skew_index; -} - -void dictionary::print_space_breakdown() const { +template +void dictionary::print_space_breakdown() const { std::cout << "total index size: " << essentials::convert((num_bits() + 7) / 8, essentials::MB) << " [MB]" << '\n'; std::cout << "SPACE BREAKDOWN:\n"; @@ -48,7 +26,8 @@ void dictionary::print_space_breakdown() const { << std::endl; } -void dictionary::print_info() const { +template +void dictionary::print_info() const { std::cout << "=== dictionary info:\n"; std::cout << "num_kmers = " << size() << '\n'; std::cout << "k = " << k() << '\n'; diff --git a/include/kmer.hpp b/include/kmer.hpp index 79fcdb7..c405828 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -2,7 +2,8 @@ namespace sshash { -typedef uint64_t kmer_t; +using default_kmer_t = uint64_t; +// typedef uint64_t kmer_t; // typedef __uint128_t kmer_t; } // namespace sshash \ No newline at end of file diff --git a/include/minimizer_enumerator.hpp b/include/minimizer_enumerator.hpp index aec73f8..1683794 100644 --- a/include/minimizer_enumerator.hpp +++ b/include/minimizer_enumerator.hpp @@ -48,7 +48,7 @@ struct fixed_size_deque { std::vector m_buffer; }; -template +template struct minimizer_enumerator { minimizer_enumerator() {} @@ -61,10 +61,9 @@ struct minimizer_enumerator { , m_q(k - m + 1) /* deque cannot contain more than k - m + 1 elements */ {} - template - uint64_t next(kmer_t kmer, bool clear) { + uint64_t next(kmer_t kmer, bool clear, bool reverse = false) { if (clear) { - if constexpr (reverse) { + if (reverse) { for (uint64_t i = 0; i != m_k - m_m + 1; ++i) { uint64_t mmer = static_cast((kmer >> (2 * (m_k - m_m - i))) & m_mask); eat(mmer); @@ -77,7 +76,7 @@ struct minimizer_enumerator { } } } else { - if constexpr (reverse) { + if (reverse) { uint64_t mmer = static_cast(kmer & m_mask); eat(mmer); } else { diff --git a/include/query/streaming_query.hpp b/include/query/streaming_query.hpp index b9ee0cb..128cb1a 100644 --- a/include/query/streaming_query.hpp +++ b/include/query/streaming_query.hpp @@ -9,8 +9,8 @@ namespace sshash { -template -streaming_query_report streaming_query_from_fasta_file_multiline(dictionary const* dict, +template +streaming_query_report streaming_query_from_fasta_file_multiline(dictionary const* dict, std::istream& is) { streaming_query_report report; buffered_lines_iterator it(is); @@ -42,8 +42,8 @@ streaming_query_report streaming_query_from_fasta_file_multiline(dictionary cons return report; } -template -streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, std::istream& is) { +template +streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, std::istream& is) { streaming_query_report report; std::string line; uint64_t k = dict->k(); @@ -65,8 +65,8 @@ streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, s return report; } -template -streaming_query_report streaming_query_from_fastq_file(dictionary const* dict, std::istream& is) { +template +streaming_query_report streaming_query_from_fastq_file(dictionary const* dict, std::istream& is) { streaming_query_report report; std::string line; uint64_t k = dict->k(); @@ -92,14 +92,15 @@ streaming_query_report streaming_query_from_fastq_file(dictionary const* dict, s return report; } -template -streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, std::istream& is, +template +streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, std::istream& is, bool multiline) { - if (multiline) return streaming_query_from_fasta_file_multiline(dict, is); - return streaming_query_from_fasta_file(dict, is); + if (multiline) return streaming_query_from_fasta_file_multiline(dict, is); + return streaming_query_from_fasta_file(dict, is); } -streaming_query_report dictionary::streaming_query_from_file(std::string const& filename, +template +streaming_query_report dictionary::streaming_query_from_file(std::string const& filename, bool multiline) const { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); @@ -109,10 +110,10 @@ streaming_query_report dictionary::streaming_query_from_file(std::string const& zip_istream zis(is); if (canonicalized()) { - report = streaming_query_from_fasta_file(this, zis, + report = streaming_query_from_fasta_file>(this, zis, multiline); } else { - report = streaming_query_from_fasta_file(this, zis, + report = streaming_query_from_fasta_file>(this, zis, multiline); } @@ -124,17 +125,17 @@ streaming_query_report dictionary::streaming_query_from_file(std::string const& zip_istream zis(is); if (canonicalized()) { - report = streaming_query_from_fastq_file(this, zis); + report = streaming_query_from_fastq_file>(this, zis); } else { - report = streaming_query_from_fastq_file(this, zis); + report = streaming_query_from_fastq_file>(this, zis); } } else if (util::ends_with(filename, ".fa") or util::ends_with(filename, ".fasta")) { if (canonicalized()) { - report = streaming_query_from_fasta_file(this, is, + report = streaming_query_from_fasta_file>(this, is, multiline); } else { - report = streaming_query_from_fasta_file(this, is, + report = streaming_query_from_fasta_file>(this, is, multiline); } @@ -144,9 +145,9 @@ streaming_query_report dictionary::streaming_query_from_file(std::string const& << std::endl; } if (canonicalized()) { - report = streaming_query_from_fastq_file(this, is); + report = streaming_query_from_fastq_file>(this, is); } else { - report = streaming_query_from_fastq_file(this, is); + report = streaming_query_from_fastq_file>(this, is); } } else { diff --git a/include/query/streaming_query_canonical_parsing.hpp b/include/query/streaming_query_canonical_parsing.hpp index 3afeafb..bf122be 100644 --- a/include/query/streaming_query_canonical_parsing.hpp +++ b/include/query/streaming_query_canonical_parsing.hpp @@ -6,8 +6,9 @@ namespace sshash { +template struct streaming_query_canonical_parsing { - streaming_query_canonical_parsing(dictionary const* dict) + streaming_query_canonical_parsing(dictionary const* dict) : m_dict(dict) @@ -54,17 +55,17 @@ struct streaming_query_canonical_parsing { /* 2. compute kmer and minimizer */ if (!m_start) { m_kmer >>= 2; - m_kmer += (util::char_to_uint(kmer[m_k - 1])) << m_shift; - assert(m_kmer == util::string_to_uint_kmer(kmer, m_k)); + m_kmer += (util::char_to_uint(kmer[m_k - 1])) << m_shift; + assert(m_kmer == util::string_to_uint_kmer(kmer, m_k)); } else { - m_kmer = util::string_to_uint_kmer(kmer, m_k); + m_kmer = util::string_to_uint_kmer(kmer, m_k); } m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start); - assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); - m_kmer_rc = util::compute_reverse_complement(m_kmer, m_k); + assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); + m_kmer_rc = util::compute_reverse_complement(m_kmer, m_k); constexpr bool reverse = true; - uint64_t minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start); - assert(minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); + uint64_t minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start, reverse); + assert(minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); m_curr_minimizer = std::min(m_curr_minimizer, minimizer_rc); /* 3. compute result */ @@ -101,14 +102,14 @@ struct streaming_query_canonical_parsing { uint64_t num_extensions() const { return m_num_extensions; } private: - dictionary const* m_dict; + dictionary const* m_dict; /* result */ lookup_result m_res; /* (kmer,minimizer) state */ - minimizer_enumerator<> m_minimizer_enum; - minimizer_enumerator<> m_minimizer_enum_rc; + minimizer_enumerator m_minimizer_enum; + minimizer_enumerator m_minimizer_enum_rc; bool m_minimizer_not_found; bool m_start; uint64_t m_curr_minimizer, m_prev_minimizer; @@ -118,7 +119,7 @@ struct streaming_query_canonical_parsing { uint64_t m_shift, m_k, m_m, m_seed; /* string state */ - bit_vector_iterator m_string_iterator; + bit_vector_iterator m_string_iterator; uint64_t m_begin, m_end; uint64_t m_pos_in_window, m_window_size; bool m_reverse; diff --git a/include/query/streaming_query_regular_parsing.hpp b/include/query/streaming_query_regular_parsing.hpp index c3f34f1..2d13ff1 100644 --- a/include/query/streaming_query_regular_parsing.hpp +++ b/include/query/streaming_query_regular_parsing.hpp @@ -6,8 +6,9 @@ namespace sshash { +template struct streaming_query_regular_parsing { - streaming_query_regular_parsing(dictionary const* dict) + streaming_query_regular_parsing(dictionary const* dict) : m_dict(dict) @@ -58,17 +59,17 @@ struct streaming_query_regular_parsing { /* 2. compute kmer and minimizer */ if (!m_start) { m_kmer >>= 2; - m_kmer += (util::char_to_uint(kmer[m_k - 1])) << m_shift; - assert(m_kmer == util::string_to_uint_kmer(kmer, m_k)); + m_kmer += (util::char_to_uint(kmer[m_k - 1])) << m_shift; + assert(m_kmer == util::string_to_uint_kmer(kmer, m_k)); } else { - m_kmer = util::string_to_uint_kmer(kmer, m_k); + m_kmer = util::string_to_uint_kmer(kmer, m_k); } m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start); - assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); - m_kmer_rc = util::compute_reverse_complement(m_kmer, m_k); + assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); + m_kmer_rc = util::compute_reverse_complement(m_kmer, m_k); constexpr bool reverse = true; - m_curr_minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start); - assert(m_curr_minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); + m_curr_minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start, reverse); + assert(m_curr_minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); bool both_minimizers_not_found = (same_minimizer() and m_minimizer_not_found) and (same_minimizer_rc() and m_minimizer_rc_not_found); @@ -129,14 +130,14 @@ struct streaming_query_regular_parsing { uint64_t num_extensions() const { return m_num_extensions; } private: - dictionary const* m_dict; + dictionary const* m_dict; /* result */ lookup_result m_res; /* (kmer,minimizer) state */ - minimizer_enumerator<> m_minimizer_enum; - minimizer_enumerator<> m_minimizer_enum_rc; + minimizer_enumerator m_minimizer_enum; + minimizer_enumerator m_minimizer_enum_rc; bool m_minimizer_not_found, m_minimizer_rc_not_found; bool m_start; uint64_t m_curr_minimizer, m_curr_minimizer_rc; @@ -147,7 +148,7 @@ struct streaming_query_regular_parsing { uint64_t m_shift, m_k, m_m, m_seed; /* string state */ - bit_vector_iterator m_string_iterator; + bit_vector_iterator m_string_iterator; uint64_t m_begin, m_end; uint64_t m_pos_in_window, m_window_size; bool m_reverse; diff --git a/include/skew_index.hpp b/include/skew_index.hpp index ca9b2ef..b2591ab 100644 --- a/include/skew_index.hpp +++ b/include/skew_index.hpp @@ -4,6 +4,7 @@ namespace sshash { +template struct skew_index { skew_index() : min_log2(constants::min_l) @@ -14,7 +15,25 @@ struct skew_index { } /* Returns the number of k-mers in the index. */ - uint64_t print_info() const; + uint64_t print_info() const { + uint64_t num_partitions = mphfs.size(); + uint64_t lower = 1ULL << min_log2; + uint64_t upper = 2 * lower; + uint64_t num_kmers_in_skew_index = 0; + for (uint64_t partition_id = 0; partition_id != num_partitions; ++partition_id) { + uint64_t n = mphfs[partition_id].num_keys(); + assert(n == positions[partition_id].size()); + std::cout << "num_kmers belonging to buckets of size > " << lower << " and <= " << upper + << ": " << n << "; "; + std::cout << "bits/kmer = " << static_cast(mphfs[partition_id].num_bits()) / n + << " (mphf) + " << (positions[partition_id].bytes() * 8.0) / n + << " (positions)\n"; + num_kmers_in_skew_index += n; + lower = upper; + upper = 2 * lower; + } + return num_kmers_in_skew_index; + } bool empty() const { return mphfs.empty(); } @@ -54,8 +73,8 @@ struct skew_index { uint16_t min_log2; uint16_t max_log2; uint32_t log2_max_num_super_kmers_in_bucket; - std::vector mphfs; + std::vector> mphfs; std::vector positions; }; -} // namespace sshash \ No newline at end of file +} // namespace sshash diff --git a/include/statistics.cpp b/include/statistics.impl similarity index 93% rename from include/statistics.cpp rename to include/statistics.impl index 3b77d67..885a1b8 100644 --- a/include/statistics.cpp +++ b/include/statistics.impl @@ -1,9 +1,9 @@ -#include "dictionary.hpp" #include "buckets_statistics.hpp" namespace sshash { -void dictionary::compute_statistics() const { +template +void dictionary::compute_statistics() const { uint64_t num_kmers = size(); uint64_t num_minimizers = m_minimizers.size(); uint64_t num_super_kmers = m_buckets.offsets.size(); @@ -20,7 +20,7 @@ void dictionary::compute_statistics() const { uint64_t offset = m_buckets.offsets.access(super_kmer_id); auto [_, contig_end] = m_buckets.offset_to_id(offset, m_k); (void)_; - bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); + bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); uint64_t window_size = std::min(m_k - m_m + 1, contig_end - offset - m_k + 1); uint64_t prev_minimizer = constants::invalid_uint64; uint64_t w = 0; diff --git a/include/util.hpp b/include/util.hpp index 4c5645d..5cf6748 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -139,6 +139,7 @@ char decimal binary g 103 01100-11-1 -> 11 t 116 01110-10-0 -> 10 */ +template static kmer_t char_to_uint(char c) { return (c >> 1) & 3; } static char uint64_to_char(uint64_t x) { @@ -147,13 +148,15 @@ static char uint64_to_char(uint64_t x) { return nucleotides[x]; } +template [[maybe_unused]] static kmer_t string_to_uint_kmer(char const* str, uint64_t k) { assert(k <= constants::max_k); kmer_t x = 0; - for (uint64_t i = 0; i != k; ++i) x += char_to_uint(str[i]) << (2 * i); + for (uint64_t i = 0; i != k; ++i) x += char_to_uint(str[i]) << (2 * i); return x; } +template static void uint_kmer_to_string(kmer_t x, char* str, uint64_t k) { assert(k <= constants::max_k); for (uint64_t i = 0; i != k; ++i) { @@ -162,6 +165,7 @@ static void uint_kmer_to_string(kmer_t x, char* str, uint64_t k) { } } +template [[maybe_unused]] static std::string uint_kmer_to_string(kmer_t x, uint64_t k) { assert(k <= constants::max_k); std::string str; @@ -197,6 +201,7 @@ template return res; } +template [[maybe_unused]] static kmer_t compute_reverse_complement(kmer_t x, uint64_t k) { assert(k <= constants::max_k); if constexpr (constants::uint_kmer_bits == 64) { @@ -288,7 +293,7 @@ static inline bool is_valid(int c) { return canonicalize_basepair_forward_map[c] return true; } -template +template uint64_t compute_minimizer(kmer_t kmer, uint64_t k, uint64_t m, uint64_t seed) { assert(m <= constants::max_m); assert(m <= k); @@ -308,7 +313,7 @@ uint64_t compute_minimizer(kmer_t kmer, uint64_t k, uint64_t m, uint64_t seed) { } /* used in dump.cpp */ -template +template std::pair compute_minimizer_pos(kmer_t kmer, uint64_t k, uint64_t m, uint64_t seed) { assert(m <= constants::max_m); diff --git a/src/bench_utils.hpp b/src/bench_utils.hpp index fa6dc59..c7f70ba 100644 --- a/src/bench_utils.hpp +++ b/src/bench_utils.hpp @@ -2,7 +2,8 @@ namespace sshash { -void perf_test_iterator(dictionary const& dict) { +template +void perf_test_iterator(dictionary const& dict) { essentials::timer t; t.start(); auto it = dict.begin(); @@ -16,7 +17,8 @@ void perf_test_iterator(dictionary const& dict) { std::cout << "iterator: avg_nanosec_per_kmer " << avg_nanosec << std::endl; } -void perf_test_lookup_access(dictionary const& dict) { +template +void perf_test_lookup_access(dictionary const& dict) { constexpr uint64_t num_queries = 1000000; constexpr uint64_t runs = 5; essentials::uniform_int_rng distr(0, dict.size() - 1, essentials::get_random_seed()); @@ -138,7 +140,8 @@ void perf_test_lookup_access(dictionary const& dict) { } } -void perf_test_lookup_weight(dictionary const& dict) { +template +void perf_test_lookup_weight(dictionary const& dict) { if (!dict.weighted()) { std::cerr << "ERROR: the dictionary does not store weights" << std::endl; return; diff --git a/src/check_utils.hpp b/src/check_utils.hpp index 0a5c85d..fa2aa9c 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -6,7 +6,8 @@ namespace sshash { -bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { +template +bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { uint64_t k = dict.k(); uint64_t n = dict.size(); @@ -41,7 +42,7 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { for (uint64_t i = 0; i + k <= line.size(); ++i) { assert(util::is_valid(line.data() + i, k)); - kmer_t uint_kmer = util::string_to_uint_kmer(line.data() + i, k); + kmer_t uint_kmer = util::string_to_uint_kmer(line.data() + i, k); bool orientation = constants::forward_orientation; if (num_kmers != 0 and num_kmers % 5000000 == 0) { @@ -138,8 +139,8 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { // check access dict.access(id, got_kmer_str.data()); - kmer_t got_uint_kmer = util::string_to_uint_kmer(got_kmer_str.data(), k); - kmer_t got_uint_kmer_rc = util::compute_reverse_complement(got_uint_kmer, k); + kmer_t got_uint_kmer = util::string_to_uint_kmer(got_kmer_str.data(), k); + kmer_t got_uint_kmer_rc = util::compute_reverse_complement(got_uint_kmer, k); if (got_uint_kmer != uint_kmer and got_uint_kmer_rc != uint_kmer) { std::cout << "ERROR: got '" << got_kmer_str << "' but expected '" << expected_kmer_str << "'" << std::endl; @@ -176,7 +177,8 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { return true; } -bool check_correctness_navigational_kmer_query(std::istream& is, dictionary const& dict) { +template +bool check_correctness_navigational_kmer_query(std::istream& is, dictionary const& dict) { uint64_t k = dict.k(); std::string line; uint64_t pos = 0; @@ -271,7 +273,8 @@ bool check_correctness_navigational_kmer_query(std::istream& is, dictionary cons return true; } -bool check_correctness_navigational_contig_query(dictionary const& dict) { +template +bool check_correctness_navigational_contig_query(dictionary const& dict) { std::cout << "checking correctness of navigational queries for contigs..." << std::endl; uint64_t num_contigs = dict.num_contigs(); uint64_t k = dict.k(); @@ -309,7 +312,8 @@ bool check_correctness_navigational_contig_query(dictionary const& dict) { return true; } -bool check_correctness_weights(std::istream& is, dictionary const& dict) { +template +bool check_correctness_weights(std::istream& is, dictionary const& dict) { uint64_t k = dict.k(); std::string line; uint64_t kmer_id = 0; @@ -364,7 +368,8 @@ bool check_correctness_weights(std::istream& is, dictionary const& dict) { The input file must be the one the index was built from. Throughout the code, we assume the input does not contain any duplicate. */ -bool check_correctness_lookup_access(dictionary const& dict, std::string const& filename) { +template +bool check_correctness_lookup_access(dictionary const& dict, std::string const& filename) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); bool good = true; @@ -382,7 +387,8 @@ bool check_correctness_lookup_access(dictionary const& dict, std::string const& The input file must be the one the index was built from. Throughout the code, we assume the input does not contain any duplicate. */ -bool check_correctness_navigational_kmer_query(dictionary const& dict, +template +bool check_correctness_navigational_kmer_query(dictionary const& dict, std::string const& filename) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); @@ -400,7 +406,8 @@ bool check_correctness_navigational_kmer_query(dictionary const& dict, /* The input file must be the one the index was built from. */ -bool check_correctness_weights(dictionary const& dict, std::string const& filename) { +template +bool check_correctness_weights(dictionary const& dict, std::string const& filename) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); bool good = true; @@ -414,7 +421,8 @@ bool check_correctness_weights(dictionary const& dict, std::string const& filena return good; } -bool check_dictionary(dictionary const& dict) { +template +bool check_dictionary(dictionary const& dict) { uint64_t k = dict.k(); uint64_t n = dict.size(); std::cout << "checking correctness of access and positive lookup..." << std::endl; @@ -442,7 +450,8 @@ bool check_dictionary(dictionary const& dict) { return true; } -bool check_correctness_iterator(dictionary const& dict) { +template +bool check_correctness_iterator(dictionary const& dict) { std::cout << "checking correctness of iterator..." << std::endl; std::string expected_kmer(dict.k(), 0); constexpr uint64_t runs = 3; diff --git a/src/common.hpp b/src/common.hpp index 00f49ec..dcd07c9 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -11,7 +11,8 @@ void random_kmer(char* kmer, uint64_t k) { for (uint64_t i = 0; i != k; ++i) kmer[i] = "ACGT"[rand() % 4]; } -void load_dictionary(dictionary& dict, std::string const& index_filename, bool verbose) { +template +void load_dictionary(dictionary& dict, std::string const& index_filename, bool verbose) { uint64_t num_bytes_read = essentials::load(dict, index_filename.c_str()); if (verbose) { std::cout << "index size: " << essentials::convert(num_bytes_read, essentials::MB) diff --git a/src/sshash.cpp b/src/sshash.cpp index f6e8ac3..674b881 100644 --- a/src/sshash.cpp +++ b/src/sshash.cpp @@ -9,6 +9,8 @@ using namespace sshash; +using kmer_t = default_kmer_t; + int check(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); @@ -16,7 +18,7 @@ int check(int argc, char** argv) { if (!parser.parse()) return 1; auto index_filename = parser.get("index_filename"); bool verbose = parser.get("verbose"); - dictionary dict; + dictionary dict; load_dictionary(dict, index_filename, verbose); check_dictionary(dict); check_correctness_navigational_contig_query(dict); @@ -30,7 +32,7 @@ int bench(int argc, char** argv) { if (!parser.parse()) return 1; auto index_filename = parser.get("index_filename"); bool verbose = parser.get("verbose"); - dictionary dict; + dictionary dict; load_dictionary(dict, index_filename, verbose); perf_test_lookup_access(dict); if (dict.weighted()) perf_test_lookup_weight(dict); @@ -47,7 +49,7 @@ int dump(int argc, char** argv) { auto index_filename = parser.get("index_filename"); auto output_filename = parser.get("output_filename"); bool verbose = parser.get("verbose"); - dictionary dict; + dictionary dict; load_dictionary(dict, index_filename, verbose); dict.dump(output_filename); return 0; @@ -60,7 +62,7 @@ int compute_statistics(int argc, char** argv) { if (!parser.parse()) return 1; auto index_filename = parser.get("index_filename"); bool verbose = parser.get("verbose"); - dictionary dict; + dictionary dict; load_dictionary(dict, index_filename, verbose); dict.compute_statistics(); return 0; diff --git a/test/test_alphabet.cpp b/test/test_alphabet.cpp index 6a08d2c..3cc131e 100644 --- a/test/test_alphabet.cpp +++ b/test/test_alphabet.cpp @@ -15,6 +15,8 @@ void expect(T got, T expected) { } } +using kmer_t = default_kmer_t; + int main(int /*argc*/, char** argv) { uint64_t k = std::stoull(argv[1]); std::string read( @@ -33,24 +35,24 @@ int main(int /*argc*/, char** argv) { << std::endl; } std::cout << "read: '" << std::string(read.data() + i, k) << "'; "; - kmer_t x = util::string_to_uint_kmer(read.data() + i, k); - std::string kmer = util::uint_kmer_to_string(x, k); + default_kmer_t x = util::string_to_uint_kmer(read.data() + i, k); + std::string kmer = util::uint_kmer_to_string(x, k); std::cout << "capitalized: '" << kmer << "'" << std::endl; } /****/ - expect(util::char_to_uint('A'), kmer_t(0)); - expect(util::char_to_uint('a'), kmer_t(0)); + expect(util::char_to_uint('A'), kmer_t(0)); + expect(util::char_to_uint('a'), kmer_t(0)); - expect(util::char_to_uint('C'), kmer_t(1)); - expect(util::char_to_uint('c'), kmer_t(1)); + expect(util::char_to_uint('C'), kmer_t(1)); + expect(util::char_to_uint('c'), kmer_t(1)); - expect(util::char_to_uint('T'), kmer_t(2)); - expect(util::char_to_uint('t'), kmer_t(2)); + expect(util::char_to_uint('T'), kmer_t(2)); + expect(util::char_to_uint('t'), kmer_t(2)); - expect(util::char_to_uint('G'), kmer_t(3)); - expect(util::char_to_uint('g'), kmer_t(3)); + expect(util::char_to_uint('G'), kmer_t(3)); + expect(util::char_to_uint('g'), kmer_t(3)); /****/ From 8ef1103de90386b04179138ae994d5634f901b40 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 13 Dec 2023 16:06:11 +0100 Subject: [PATCH 02/21] Apply clang-format --- include/bit_vector_iterator.hpp | 2 +- include/buckets.hpp | 2 +- include/builder/build.impl | 5 +-- include/dictionary.hpp | 6 ++-- include/dictionary.impl | 60 ++++++++++++++++++--------------- include/dump.impl | 2 +- include/hash_util.hpp | 16 ++++----- include/info.impl | 4 +-- include/skew_index.hpp | 8 ++--- include/statistics.impl | 2 +- include/util.hpp | 14 ++++---- 11 files changed, 63 insertions(+), 58 deletions(-) diff --git a/include/bit_vector_iterator.hpp b/include/bit_vector_iterator.hpp index 0393cbe..325b197 100644 --- a/include/bit_vector_iterator.hpp +++ b/include/bit_vector_iterator.hpp @@ -5,7 +5,7 @@ namespace sshash { -template +template struct bit_vector_iterator { bit_vector_iterator() : m_bv(nullptr) {} diff --git a/include/buckets.hpp b/include/buckets.hpp index f076ec1..dff43cd 100644 --- a/include/buckets.hpp +++ b/include/buckets.hpp @@ -6,7 +6,7 @@ namespace sshash { -template +template struct buckets { std::pair offset_to_id(uint64_t offset, uint64_t k) const { auto [pos, contig_begin, contig_end] = pieces.locate(offset); diff --git a/include/builder/build.impl b/include/builder/build.impl index d4c5f63..5c3b385 100644 --- a/include/builder/build.impl +++ b/include/builder/build.impl @@ -11,8 +11,9 @@ namespace sshash { -template -void dictionary::build(std::string const& filename, build_configuration const& build_config) { +template +void dictionary::build(std::string const& filename, + build_configuration const& build_config) { /* Validate the build configuration. */ if (build_config.k == 0) throw std::runtime_error("k must be > 0"); if (build_config.k > constants::max_k) { diff --git a/include/dictionary.hpp b/include/dictionary.hpp index f546819..d70f4a0 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -9,13 +9,13 @@ namespace sshash { // Forward declarations of the friend template classes -template +template struct streaming_query_canonical_parsing; -template +template struct streaming_query_regular_parsing; -template +template struct dictionary { dictionary() : m_size(0), m_seed(0), m_k(0), m_m(0), m_canonical_parsing(0) {} diff --git a/include/dictionary.impl b/include/dictionary.impl index b8e83cf..708db7f 100644 --- a/include/dictionary.impl +++ b/include/dictionary.impl @@ -1,6 +1,6 @@ namespace sshash { -template +template lookup_result dictionary::lookup_uint_regular_parsing(kmer_t uint_kmer) const { uint64_t minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); uint64_t bucket_id = m_minimizers.lookup(minimizer); @@ -22,7 +22,7 @@ lookup_result dictionary::lookup_uint_regular_parsing(kmer_t uint_kmer) return m_buckets.lookup(begin, end, uint_kmer, m_k, m_m); } -template +template lookup_result dictionary::lookup_uint_canonical_parsing(kmer_t uint_kmer) const { kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, m_k); uint64_t minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); @@ -55,26 +55,26 @@ lookup_result dictionary::lookup_uint_canonical_parsing(kmer_t uint_kmer return m_buckets.lookup_canonical(begin, end, uint_kmer, uint_kmer_rc, m_k, m_m); } -template +template uint64_t dictionary::lookup(char const* string_kmer, bool check_reverse_complement) const { kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return lookup_uint(uint_kmer, check_reverse_complement); } -template +template uint64_t dictionary::lookup_uint(kmer_t uint_kmer, bool check_reverse_complement) const { auto res = lookup_advanced_uint(uint_kmer, check_reverse_complement); return res.kmer_id; } -template +template lookup_result dictionary::lookup_advanced(char const* string_kmer, - bool check_reverse_complement) const { + bool check_reverse_complement) const { kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return lookup_advanced_uint(uint_kmer, check_reverse_complement); } -template +template lookup_result dictionary::lookup_advanced_uint(kmer_t uint_kmer, - bool check_reverse_complement) const { + bool check_reverse_complement) const { if (m_canonical_parsing) return lookup_uint_canonical_parsing(uint_kmer); auto res = lookup_uint_regular_parsing(uint_kmer); assert(res.kmer_orientation == constants::forward_orientation); @@ -86,28 +86,28 @@ lookup_result dictionary::lookup_advanced_uint(kmer_t uint_kmer, return res; } -template +template bool dictionary::is_member(char const* string_kmer, bool check_reverse_complement) const { return lookup(string_kmer, check_reverse_complement) != constants::invalid_uint64; } -template +template bool dictionary::is_member_uint(kmer_t uint_kmer, bool check_reverse_complement) const { return lookup_uint(uint_kmer, check_reverse_complement) != constants::invalid_uint64; } -template +template void dictionary::access(uint64_t kmer_id, char* string_kmer) const { assert(kmer_id < size()); m_buckets.access(kmer_id, string_kmer, m_k); } -template +template uint64_t dictionary::weight(uint64_t kmer_id) const { assert(kmer_id < size()); return m_weights.weight(kmer_id); } -template +template uint64_t dictionary::contig_size(uint64_t contig_id) const { assert(contig_id < num_contigs()); uint64_t contig_length = m_buckets.contig_length(contig_id); @@ -115,14 +115,18 @@ uint64_t dictionary::contig_size(uint64_t contig_id) const { return contig_length - m_k + 1; } -template +template void dictionary::forward_neighbours(kmer_t suffix, neighbourhood& res) const { - res.forward_A = lookup_advanced_uint(suffix + (util::char_to_uint('A') << (2 * (m_k - 1)))); - res.forward_C = lookup_advanced_uint(suffix + (util::char_to_uint('C') << (2 * (m_k - 1)))); - res.forward_G = lookup_advanced_uint(suffix + (util::char_to_uint('G') << (2 * (m_k - 1)))); - res.forward_T = lookup_advanced_uint(suffix + (util::char_to_uint('T') << (2 * (m_k - 1)))); -} -template + res.forward_A = + lookup_advanced_uint(suffix + (util::char_to_uint('A') << (2 * (m_k - 1)))); + res.forward_C = + lookup_advanced_uint(suffix + (util::char_to_uint('C') << (2 * (m_k - 1)))); + res.forward_G = + lookup_advanced_uint(suffix + (util::char_to_uint('G') << (2 * (m_k - 1)))); + res.forward_T = + lookup_advanced_uint(suffix + (util::char_to_uint('T') << (2 * (m_k - 1)))); +} +template void dictionary::backward_neighbours(kmer_t prefix, neighbourhood& res) const { res.backward_A = lookup_advanced_uint(prefix + util::char_to_uint('A')); res.backward_C = lookup_advanced_uint(prefix + util::char_to_uint('C')); @@ -130,12 +134,12 @@ void dictionary::backward_neighbours(kmer_t prefix, neighbourhood& res) res.backward_T = lookup_advanced_uint(prefix + util::char_to_uint('T')); } -template +template neighbourhood dictionary::kmer_forward_neighbours(char const* string_kmer) const { kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return kmer_forward_neighbours(uint_kmer); } -template +template neighbourhood dictionary::kmer_forward_neighbours(kmer_t uint_kmer) const { neighbourhood res; kmer_t suffix = uint_kmer >> 2; @@ -143,12 +147,12 @@ neighbourhood dictionary::kmer_forward_neighbours(kmer_t uint_kmer) cons return res; } -template +template neighbourhood dictionary::kmer_backward_neighbours(char const* string_kmer) const { kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return kmer_backward_neighbours(uint_kmer); } -template +template neighbourhood dictionary::kmer_backward_neighbours(kmer_t uint_kmer) const { neighbourhood res; kmer_t prefix = (uint_kmer << 2) & ((kmer_t(1) << (2 * m_k)) - 1); @@ -156,12 +160,12 @@ neighbourhood dictionary::kmer_backward_neighbours(kmer_t uint_kmer) con return res; } -template +template neighbourhood dictionary::kmer_neighbours(char const* string_kmer) const { kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return kmer_neighbours(uint_kmer); } -template +template neighbourhood dictionary::kmer_neighbours(kmer_t uint_kmer) const { neighbourhood res; kmer_t suffix = uint_kmer >> 2; @@ -171,7 +175,7 @@ neighbourhood dictionary::kmer_neighbours(kmer_t uint_kmer) const { return res; } -template +template neighbourhood dictionary::contig_neighbours(uint64_t contig_id) const { assert(contig_id < num_contigs()); neighbourhood res; @@ -182,7 +186,7 @@ neighbourhood dictionary::contig_neighbours(uint64_t contig_id) const { return res; } -template +template uint64_t dictionary::num_bits() const { return 8 * (sizeof(m_size) + sizeof(m_seed) + sizeof(m_k) + sizeof(m_m) + sizeof(m_canonical_parsing)) + diff --git a/include/dump.impl b/include/dump.impl index 8c7d005..52846ef 100644 --- a/include/dump.impl +++ b/include/dump.impl @@ -1,6 +1,6 @@ namespace sshash { -template +template void dictionary::dump(std::string const& filename) const { uint64_t num_kmers = size(); uint64_t num_minimizers = m_minimizers.size(); diff --git a/include/hash_util.hpp b/include/hash_util.hpp index 5685104..ddc94ec 100644 --- a/include/hash_util.hpp +++ b/include/hash_util.hpp @@ -5,7 +5,7 @@ namespace sshash { -template +template struct kmers_pthash_hasher_64 { typedef pthash::hash64 hash_type; @@ -25,7 +25,7 @@ struct kmers_pthash_hasher_64 { } }; -template +template struct kmers_pthash_hasher_128 { typedef pthash::hash128 hash_type; @@ -52,7 +52,7 @@ struct kmers_pthash_hasher_128 { // typedef pthash::murmurhash2_64 minimizers_base_hasher_type; typedef pthash::murmurhash2_128 minimizers_base_hasher_type; -template +template using kmers_base_hasher_type = kmers_pthash_hasher_128; typedef pthash::single_phf minimizers_pthash_type; -template -using kmers_pthash_type = pthash::single_phf< - kmers_base_hasher_type, // base hasher - pthash::dictionary_dictionary, // encoder type - true>; // minimal output - +template +using kmers_pthash_type = pthash::single_phf, // base hasher + pthash::dictionary_dictionary, // encoder type + true>; // minimal output /* used to hash m-mers and determine the minimizer of a k-mer */ struct murmurhash2_64 { diff --git a/include/info.impl b/include/info.impl index 649097f..fce7f8f 100644 --- a/include/info.impl +++ b/include/info.impl @@ -1,6 +1,6 @@ namespace sshash { -template +template void dictionary::print_space_breakdown() const { std::cout << "total index size: " << essentials::convert((num_bits() + 7) / 8, essentials::MB) << " [MB]" << '\n'; @@ -26,7 +26,7 @@ void dictionary::print_space_breakdown() const { << std::endl; } -template +template void dictionary::print_info() const { std::cout << "=== dictionary info:\n"; std::cout << "num_kmers = " << size() << '\n'; diff --git a/include/skew_index.hpp b/include/skew_index.hpp index b2591ab..f3473ce 100644 --- a/include/skew_index.hpp +++ b/include/skew_index.hpp @@ -4,7 +4,7 @@ namespace sshash { -template +template struct skew_index { skew_index() : min_log2(constants::min_l) @@ -24,10 +24,10 @@ struct skew_index { uint64_t n = mphfs[partition_id].num_keys(); assert(n == positions[partition_id].size()); std::cout << "num_kmers belonging to buckets of size > " << lower << " and <= " << upper - << ": " << n << "; "; + << ": " << n << "; "; std::cout << "bits/kmer = " << static_cast(mphfs[partition_id].num_bits()) / n - << " (mphf) + " << (positions[partition_id].bytes() * 8.0) / n - << " (positions)\n"; + << " (mphf) + " << (positions[partition_id].bytes() * 8.0) / n + << " (positions)\n"; num_kmers_in_skew_index += n; lower = upper; upper = 2 * lower; diff --git a/include/statistics.impl b/include/statistics.impl index 885a1b8..61b931e 100644 --- a/include/statistics.impl +++ b/include/statistics.impl @@ -2,7 +2,7 @@ namespace sshash { -template +template void dictionary::compute_statistics() const { uint64_t num_kmers = size(); uint64_t num_minimizers = m_minimizers.size(); diff --git a/include/util.hpp b/include/util.hpp index 5cf6748..87ebb41 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -139,8 +139,10 @@ char decimal binary g 103 01100-11-1 -> 11 t 116 01110-10-0 -> 10 */ -template -static kmer_t char_to_uint(char c) { return (c >> 1) & 3; } +template +static kmer_t char_to_uint(char c) { + return (c >> 1) & 3; +} static char uint64_to_char(uint64_t x) { assert(x <= 3); @@ -148,7 +150,7 @@ static char uint64_to_char(uint64_t x) { return nucleotides[x]; } -template +template [[maybe_unused]] static kmer_t string_to_uint_kmer(char const* str, uint64_t k) { assert(k <= constants::max_k); kmer_t x = 0; @@ -156,7 +158,7 @@ template return x; } -template +template static void uint_kmer_to_string(kmer_t x, char* str, uint64_t k) { assert(k <= constants::max_k); for (uint64_t i = 0; i != k; ++i) { @@ -165,7 +167,7 @@ static void uint_kmer_to_string(kmer_t x, char* str, uint64_t k) { } } -template +template [[maybe_unused]] static std::string uint_kmer_to_string(kmer_t x, uint64_t k) { assert(k <= constants::max_k); std::string str; @@ -201,7 +203,7 @@ template return res; } -template +template [[maybe_unused]] static kmer_t compute_reverse_complement(kmer_t x, uint64_t k) { assert(k <= constants::max_k); if constexpr (constants::uint_kmer_bits == 64) { From 3a7062a882ab98e41b56c5c07956f6615f9906da Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 10 Jan 2024 18:30:00 +0100 Subject: [PATCH 03/21] Introduce template-based kmer class --- include/bit_vector_iterator.hpp | 102 +++++------- include/buckets.hpp | 14 +- include/builder/build.impl | 4 +- include/builder/build_index.hpp | 5 +- include/builder/build_skew_index.hpp | 6 +- include/builder/parse_file.hpp | 30 ++-- include/builder/util.hpp | 14 +- include/constants.hpp | 5 - include/dictionary.hpp | 22 +-- include/dictionary.impl | 92 ++++++----- include/dump.impl | 10 +- include/hash_util.hpp | 8 +- include/kmer.hpp | 151 +++++++++++++++++- include/minimizer_enumerator.hpp | 34 ++-- include/query/streaming_query.hpp | 50 +++--- .../streaming_query_canonical_parsing.hpp | 18 +-- .../query/streaming_query_regular_parsing.hpp | 14 +- include/statistics.impl | 6 +- include/util.hpp | 130 +++------------ src/bench_utils.hpp | 6 +- src/build.cpp | 6 +- src/check_utils.hpp | 99 ++++-------- src/common.hpp | 2 +- src/permute.cpp | 8 +- src/query.cpp | 3 +- src/sshash.cpp | 20 +-- test/test_alphabet.cpp | 18 +-- 27 files changed, 447 insertions(+), 430 deletions(-) diff --git a/include/bit_vector_iterator.hpp b/include/bit_vector_iterator.hpp index 325b197..bbe24a3 100644 --- a/include/bit_vector_iterator.hpp +++ b/include/bit_vector_iterator.hpp @@ -18,79 +18,67 @@ struct bit_vector_iterator { } inline kmer_t read(uint64_t l) { - assert(l <= constants::uint_kmer_bits); + assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf(); - kmer_t val = 0; - if (l != constants::uint_kmer_bits) { - val = m_buf & ((kmer_t(1) << l) - 1); - } else { - val = m_buf; - } + kmer_t val = m_buf; + val.take(l); return val; } inline kmer_t read_reverse(uint64_t l) { - assert(l <= constants::uint_kmer_bits); + assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf_reverse(); - kmer_t val = 0; - if (l != constants::uint_kmer_bits) { - uint64_t shift = (l >= 64) ? (constants::uint_kmer_bits - l) : 64; - val = m_buf >> shift; - } else { - val = m_buf; - } + kmer_t val = m_buf; + val.drop(kmer_t::uint_kmer_bits - l); return val; } inline void eat(uint64_t l) { - assert(l <= constants::uint_kmer_bits); + assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf(); - if (l != constants::uint_kmer_bits) m_buf >>= l; + if (l != kmer_t::uint_kmer_bits) m_buf.drop(l); m_avail -= l; m_pos += l; } inline void eat_reverse(uint64_t l) { - assert(l <= constants::uint_kmer_bits); + assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf_reverse(); - if (l != constants::uint_kmer_bits) m_buf <<= l; + if (l != kmer_t::uint_kmer_bits) m_buf.pad(l); m_avail -= l; m_pos -= l; } - inline kmer_t read_and_advance_by_two(uint64_t l) { - assert(l <= constants::uint_kmer_bits); + inline kmer_t read_and_advance_by_char(uint64_t l) { + assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf(); - kmer_t val = 0; - if (l != constants::uint_kmer_bits) { - val = m_buf & ((kmer_t(1) << l) - 1); - m_buf >>= 2; - } else { - val = m_buf; + kmer_t val = m_buf; + if (l != kmer_t::uint_kmer_bits) { + val.take(l); + m_buf.drop_char(); } m_avail -= 2; m_pos += 2; return val; } - inline kmer_t get_next_two_bits() { - if (m_avail < 2) fill_buf(); - kmer_t val = m_buf & 3; - m_buf >>= 2; + inline uint64_t get_next_char() { + if (m_avail < kmer_t::bits_per_char) fill_buf(); + kmer_t val = m_buf; + val.take_char(); + m_buf.drop_char(); m_avail -= 2; m_pos += 2; - return val; + return uint64_t(val); } inline kmer_t take(uint64_t l) { - assert(l <= constants::uint_kmer_bits); + assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf(); - kmer_t val = 0; - if (l != constants::uint_kmer_bits) { - val = m_buf & ((kmer_t(1) << l) - 1); - m_buf >>= l; - } else { - val = m_buf; + kmer_t val = m_buf; + if (l != kmer_t::uint_kmer_bits) { + val.take(l); + m_buf.drop(l); } m_avail -= l; m_pos += l; @@ -101,38 +89,20 @@ struct bit_vector_iterator { private: inline void fill_buf() { - if constexpr (constants::uint_kmer_bits == 64) { - m_buf = m_bv->get_word64(m_pos); - } else { - assert(constants::uint_kmer_bits == 128); - m_buf = static_cast(m_bv->get_word64(m_pos)); - m_buf += static_cast(m_bv->get_word64(m_pos + 64)) << 64; + static_assert(kmer_t::uint_kmer_bits % 64 == 0); + for (int i = kmer_t::uint_kmer_bits - 64; i >= 0; i -= 64) { + m_buf.append64(m_bv->get_word64(m_pos + i)); } - m_avail = constants::uint_kmer_bits; + m_avail = kmer_t::uint_kmer_bits; } inline void fill_buf_reverse() { - if constexpr (constants::uint_kmer_bits == 64) { - if (m_pos < 64) { - m_buf = m_bv->get_word64(0); - m_avail = m_pos; - m_buf <<= (64 - m_pos); - return; - } - m_buf = m_bv->get_word64(m_pos - 64); - } else { - assert(constants::uint_kmer_bits == 128); - if (m_pos < 128) { - m_buf = static_cast(m_bv->get_word64(0)) << 64; - m_buf += static_cast(m_bv->get_word64(64)); - m_avail = m_pos; - m_buf <<= (128 - m_pos); - return; - } - m_buf = static_cast(m_bv->get_word64(m_pos - 128)) << 64; - m_buf += static_cast(m_bv->get_word64(m_pos - 64)); + static_assert(kmer_t::uint_kmer_bits % 64 == 0); + for (int i = kmer_t::uint_kmer_bits; i > 0; i -= 64) { + m_buf.append64(m_bv->get_word64(std::max(m_pos, kmer_t::uint_kmer_bits) - i)); } - m_avail = constants::uint_kmer_bits; + m_avail = std::min(m_pos, kmer_t::uint_kmer_bits); + m_buf.pad(kmer_t::uint_kmer_bits - m_avail); } pthash::bit_vector const* m_bv; diff --git a/include/buckets.hpp b/include/buckets.hpp index dff43cd..2584af8 100644 --- a/include/buckets.hpp +++ b/include/buckets.hpp @@ -94,7 +94,7 @@ struct buckets { bit_vector_iterator bv_it(strings, 2 * offset); uint64_t window_size = std::min(k - m + 1, contig_end - offset - k + 1); for (uint64_t w = 0; w != window_size; ++w) { - kmer_t read_kmer = bv_it.read_and_advance_by_two(2 * k); + kmer_t read_kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * k); if (read_kmer == target_kmer) { res.kmer_id += w; res.kmer_id_in_contig += w; @@ -119,7 +119,7 @@ struct buckets { bit_vector_iterator bv_it(strings, 2 * offset); uint64_t window_size = std::min(k - m + 1, contig_end - offset - k + 1); for (uint64_t w = 0; w != window_size; ++w) { - kmer_t read_kmer = bv_it.read_and_advance_by_two(2 * k); + kmer_t read_kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * k); if (read_kmer == target_kmer) { res.kmer_id += w; res.kmer_id_in_contig += w; @@ -200,12 +200,12 @@ struct buckets { util::uint_kmer_to_string(read_kmer, ret.second.data(), m_k); } else { memmove(ret.second.data(), ret.second.data() + 1, m_k - 1); - ret.second[m_k - 1] = util::uint64_to_char(last_two_bits); + ret.second[m_k - 1] = kmer_t::uint64_to_char(last_char); } clear = false; - read_kmer >>= 2; - last_two_bits = bv_it.get_next_two_bits(); - read_kmer += last_two_bits << (2 * (m_k - 1)); + read_kmer.drop_char(); + last_char = bv_it.get_next_char(); + read_kmer.add_kth_char(m_k - 1, last_char); ++m_kmer_id; ++offset; return ret; @@ -224,7 +224,7 @@ struct buckets { ef_sequence::iterator pieces_it; kmer_t read_kmer; - uint64_t last_two_bits; + uint64_t last_char; bool clear; void next_piece() { diff --git a/include/builder/build.impl b/include/builder/build.impl index 5c3b385..80484db 100644 --- a/include/builder/build.impl +++ b/include/builder/build.impl @@ -16,8 +16,8 @@ void dictionary::build(std::string const& filename, build_configuration const& build_config) { /* Validate the build configuration. */ if (build_config.k == 0) throw std::runtime_error("k must be > 0"); - if (build_config.k > constants::max_k) { - throw std::runtime_error("k must be less <= " + std::to_string(constants::max_k) + + if (build_config.k > kmer_t::max_k) { + throw std::runtime_error("k must be less <= " + std::to_string(kmer_t::max_k) + " but got k = " + std::to_string(build_config.k)); } if (build_config.m == 0) throw std::runtime_error("m must be > 0"); diff --git a/include/builder/build_index.hpp b/include/builder/build_index.hpp index 6c90c5f..1ad4a66 100644 --- a/include/builder/build_index.hpp +++ b/include/builder/build_index.hpp @@ -155,8 +155,9 @@ struct bucket_pairs { } }; -template -buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, buckets& m_buckets, +template +buckets_statistics build_index(parse_data& data, minimizers const& m_minimizers, + buckets& m_buckets, build_configuration const& build_config) { uint64_t num_buckets = m_minimizers.size(); uint64_t num_kmers = data.num_kmers; diff --git a/include/builder/build_skew_index.hpp b/include/builder/build_skew_index.hpp index 2bb06fd..38ee5bd 100644 --- a/include/builder/build_skew_index.hpp +++ b/include/builder/build_skew_index.hpp @@ -4,9 +4,9 @@ namespace sshash { -template -void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const& m_buckets, - build_configuration const& build_config, +template +void build_skew_index(skew_index& m_skew_index, parse_data& data, + buckets const& m_buckets, build_configuration const& build_config, buckets_statistics const& buckets_stats) { const uint64_t min_log2_size = m_skew_index.min_log2; const uint64_t max_log2_size = m_skew_index.max_log2; diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index 59eab93..02431a0 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -4,7 +4,7 @@ namespace sshash { -template +template struct parse_data { parse_data(std::string const& tmp_dirname) : num_kmers(0), minimizers(tmp_dirname) {} uint64_t num_kmers; @@ -13,8 +13,9 @@ struct parse_data { weights::builder weights_builder; }; -template -void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) { +template +void parse_file(std::istream& is, parse_data& data, + build_configuration const& build_config) { uint64_t k = build_config.k; uint64_t m = build_config.m; uint64_t seed = build_config.seed; @@ -34,7 +35,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration typename compact_string_pool::builder builder(k); std::string sequence; - uint64_t prev_minimizer = constants::invalid_uint64; + kmer_t prev_minimizer = constants::invalid_uint64; uint64_t begin = 0; // begin of parsed super_kmer in sequence uint64_t end = 0; // end of parsed super_kmer in sequence @@ -43,7 +44,8 @@ void parse_file(std::istream& is, parse_data& data, build_configuration bool glue = false; auto append_super_kmer = [&]() { - if (sequence.empty() or prev_minimizer == constants::invalid_uint64 or begin == end) { + if (sequence.empty() or prev_minimizer == kmer_t(constants::invalid_uint64) or + begin == end) { return; } @@ -62,7 +64,8 @@ void parse_file(std::istream& is, parse_data& data, build_configuration if (i == num_blocks - 1) n = size; uint64_t num_kmers_in_block = n - k + 1; assert(num_kmers_in_block <= max_num_kmers_in_super_kmer); - data.minimizers.emplace_back(prev_minimizer, builder.offset, num_kmers_in_block); + data.minimizers.emplace_back(uint64_t(prev_minimizer), builder.offset, + num_kmers_in_block); builder.append(super_kmer + i * max_num_kmers_in_super_kmer, n, glue); if (glue) { assert(data.minimizers.back().offset > k - 1); @@ -164,15 +167,15 @@ void parse_file(std::istream& is, parse_data& data, build_configuration char const* kmer = sequence.data() + end; assert(util::is_valid(kmer, k)); kmer_t uint_kmer = util::string_to_uint_kmer(kmer, k); - uint64_t minimizer = util::compute_minimizer(uint_kmer, k, m, seed); + auto minimizer = util::compute_minimizer(uint_kmer, k, m, seed); if (build_config.canonical_parsing) { - kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, k); - uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed); - minimizer = std::min(minimizer, minimizer_rc); + kmer_t uint_kmer_rc = uint_kmer.reverse_complement(k); + auto minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed); + minimizer = std::min(minimizer, minimizer_rc); } - if (prev_minimizer == constants::invalid_uint64) prev_minimizer = minimizer; + if (prev_minimizer == kmer_t(constants::invalid_uint64)) prev_minimizer = minimizer; if (minimizer != prev_minimizer) { append_super_kmer(); begin = end; @@ -207,8 +210,9 @@ void parse_file(std::istream& is, parse_data& data, build_configuration } } -template -parse_data parse_file(std::string const& filename, build_configuration const& build_config) { +template +parse_data parse_file(std::string const& filename, + build_configuration const& build_config) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); std::cout << "reading file '" << filename << "'..." << std::endl; diff --git a/include/builder/util.hpp b/include/builder/util.hpp index aea32ca..cb5d354 100644 --- a/include/builder/util.hpp +++ b/include/builder/util.hpp @@ -26,7 +26,7 @@ struct parse_runtime_error : public std::runtime_error { } } -template +template struct compact_string_pool { compact_string_pool() {} @@ -47,25 +47,25 @@ struct compact_string_pool { } else { /* otherwise, start a new piece */ check_contig_size(); - pieces.push_back(bvb_strings.size() / 2); + pieces.push_back(bvb_strings.size() / kmer_t::bits_per_char); } for (uint64_t i = prefix; i != size; ++i) { - bvb_strings.append_bits(util::char_to_uint(string[i]), 2); + bvb_strings.append_bits(kmer_t::char_to_uint(string[i]), kmer_t::bits_per_char); } num_super_kmers += 1; - offset = bvb_strings.size() / 2; + offset = bvb_strings.size() / kmer_t::bits_per_char; } void finalize() { /* So pieces will be of size p+1, where p is the number of DNA sequences in the input file. */ check_contig_size(); - pieces.push_back(bvb_strings.size() / 2); + pieces.push_back(bvb_strings.size() / kmer_t::bits_per_char); assert(pieces.front() == 0); /* Push a final sentinel (dummy) kmer to avoid bounds' checking in bit_vector_iterator::fill_buf(). */ - bvb_strings.append_bits(0, 2 * k); + bvb_strings.append_bits(0, k * kmer_t::bits_per_char); } uint64_t k; @@ -79,7 +79,7 @@ struct compact_string_pool { /* Support a max of 2^32-1 contigs, or "pieces", whose max length must also be < 2^32. */ if (!pieces.empty()) { - uint64_t contig_length = bvb_strings.size() / 2 - pieces.back(); + uint64_t contig_length = bvb_strings.size() / kmer_t::bits_per_char - pieces.back(); if (contig_length >= 1ULL << 32) { throw std::runtime_error("contig_length " + std::to_string(contig_length) + " does not fit into 32 bits"); diff --git a/include/constants.hpp b/include/constants.hpp index f72b815..6faa80c 100644 --- a/include/constants.hpp +++ b/include/constants.hpp @@ -4,11 +4,6 @@ namespace sshash::constants { -constexpr uint64_t uint_kmer_bits = sizeof(default_kmer_t) * 8; - -/* max *odd* size that can be packed into sizeof(kmer_t)*8 bits */ -constexpr uint64_t max_k = sizeof(default_kmer_t) * 4 - 1; - /* max *odd* size that can be packed into 64 bits */ constexpr uint64_t max_m = 31; diff --git a/include/dictionary.hpp b/include/dictionary.hpp index d70f4a0..e1d44ae 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -15,7 +15,7 @@ struct streaming_query_canonical_parsing; template struct streaming_query_regular_parsing; -template +template struct dictionary { dictionary() : m_size(0), m_seed(0), m_k(0), m_m(0), m_canonical_parsing(0) {} @@ -48,15 +48,15 @@ struct dictionary { uint64_t contig_size(uint64_t contig_id) const; /* Navigational queries. */ - neighbourhood kmer_forward_neighbours(char const* string_kmer) const; - neighbourhood kmer_forward_neighbours(kmer_t uint_kmer) const; - neighbourhood kmer_backward_neighbours(char const* string_kmer) const; - neighbourhood kmer_backward_neighbours(kmer_t uint_kmer) const; + neighbourhood kmer_forward_neighbours(char const* string_kmer) const; + neighbourhood kmer_forward_neighbours(kmer_t uint_kmer) const; + neighbourhood kmer_backward_neighbours(char const* string_kmer) const; + neighbourhood kmer_backward_neighbours(kmer_t uint_kmer) const; /* forward and backward */ - neighbourhood kmer_neighbours(char const* string_kmer) const; - neighbourhood kmer_neighbours(kmer_t uint_kmer) const; - neighbourhood contig_neighbours(uint64_t contig_id) const; + neighbourhood kmer_neighbours(char const* string_kmer) const; + neighbourhood kmer_neighbours(kmer_t uint_kmer) const; + neighbourhood contig_neighbours(uint64_t contig_id) const; /* Return the weight of the kmer given its id. */ uint64_t weight(uint64_t kmer_id) const; @@ -126,8 +126,10 @@ struct dictionary { lookup_result lookup_uint_regular_parsing(kmer_t uint_kmer) const; lookup_result lookup_uint_canonical_parsing(kmer_t uint_kmer) const; - void forward_neighbours(kmer_t suffix, neighbourhood& res) const; - void backward_neighbours(kmer_t prefix, neighbourhood& res) const; + void forward_neighbours(kmer_t suffix, neighbourhood& res) const; + void backward_neighbours(kmer_t prefix, neighbourhood& res) const; + kmer_t get_prefix(kmer_t kmer) const; + kmer_t get_suffix(kmer_t kmer) const; }; } // namespace sshash diff --git a/include/dictionary.impl b/include/dictionary.impl index 708db7f..8ad4bfc 100644 --- a/include/dictionary.impl +++ b/include/dictionary.impl @@ -2,8 +2,8 @@ namespace sshash { template lookup_result dictionary::lookup_uint_regular_parsing(kmer_t uint_kmer) const { - uint64_t minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); - uint64_t bucket_id = m_minimizers.lookup(minimizer); + auto minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); + uint64_t bucket_id = m_minimizers.lookup(uint64_t(minimizer)); if (m_skew_index.empty()) return m_buckets.lookup(bucket_id, uint_kmer, m_k, m_m); @@ -24,10 +24,10 @@ lookup_result dictionary::lookup_uint_regular_parsing(kmer_t uint_kmer) template lookup_result dictionary::lookup_uint_canonical_parsing(kmer_t uint_kmer) const { - kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, m_k); - uint64_t minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); - uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, m_k, m_m, m_seed); - uint64_t bucket_id = m_minimizers.lookup(std::min(minimizer, minimizer_rc)); + kmer_t uint_kmer_rc = uint_kmer.reverse_complement(m_k); + auto minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); + auto minimizer_rc = util::compute_minimizer(uint_kmer_rc, m_k, m_m, m_seed); + uint64_t bucket_id = m_minimizers.lookup(uint64_t(std::min(minimizer, minimizer_rc))); if (m_skew_index.empty()) { return m_buckets.lookup_canonical(bucket_id, uint_kmer, uint_kmer_rc, m_k, m_m); @@ -79,7 +79,7 @@ lookup_result dictionary::lookup_advanced_uint(kmer_t uint_kmer, auto res = lookup_uint_regular_parsing(uint_kmer); assert(res.kmer_orientation == constants::forward_orientation); if (check_reverse_complement and res.kmer_id == constants::invalid_uint64) { - kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, m_k); + kmer_t uint_kmer_rc = uint_kmer.reverse_complement(m_k); res = lookup_uint_regular_parsing(uint_kmer_rc); res.kmer_orientation = constants::backward_orientation; } @@ -116,72 +116,82 @@ uint64_t dictionary::contig_size(uint64_t contig_id) const { } template -void dictionary::forward_neighbours(kmer_t suffix, neighbourhood& res) const { - res.forward_A = - lookup_advanced_uint(suffix + (util::char_to_uint('A') << (2 * (m_k - 1)))); - res.forward_C = - lookup_advanced_uint(suffix + (util::char_to_uint('C') << (2 * (m_k - 1)))); - res.forward_G = - lookup_advanced_uint(suffix + (util::char_to_uint('G') << (2 * (m_k - 1)))); - res.forward_T = - lookup_advanced_uint(suffix + (util::char_to_uint('T') << (2 * (m_k - 1)))); +void dictionary::forward_neighbours(kmer_t suffix, neighbourhood& res) const { + for (size_t i = 0; i < kmer_t::alphabet_size; i++) { + kmer_t new_kmer = suffix; + new_kmer.add_kth_char(m_k - 1, kmer_t::alphabet[i]); + res.forward[i] = lookup_advanced_uint(new_kmer); + } } template -void dictionary::backward_neighbours(kmer_t prefix, neighbourhood& res) const { - res.backward_A = lookup_advanced_uint(prefix + util::char_to_uint('A')); - res.backward_C = lookup_advanced_uint(prefix + util::char_to_uint('C')); - res.backward_G = lookup_advanced_uint(prefix + util::char_to_uint('G')); - res.backward_T = lookup_advanced_uint(prefix + util::char_to_uint('T')); +void dictionary::backward_neighbours(kmer_t prefix, neighbourhood& res) const { + for (size_t i = 0; i < kmer_t::alphabet_size; i++) { + kmer_t new_kmer = prefix; + new_kmer.add_kth_char(0, kmer_t::alphabet[i]); + res.backward[i] = lookup_advanced_uint(new_kmer); + } } template -neighbourhood dictionary::kmer_forward_neighbours(char const* string_kmer) const { +neighbourhood dictionary::kmer_forward_neighbours(char const* string_kmer) const { kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return kmer_forward_neighbours(uint_kmer); } + +template +kmer_t dictionary::get_suffix(kmer_t kmer) const { + kmer_t suffix = kmer; + suffix.drop_char(); + return suffix; +} template -neighbourhood dictionary::kmer_forward_neighbours(kmer_t uint_kmer) const { - neighbourhood res; - kmer_t suffix = uint_kmer >> 2; - forward_neighbours(suffix, res); +neighbourhood dictionary::kmer_forward_neighbours(kmer_t uint_kmer) const { + neighbourhood res; + forward_neighbours(get_suffix(uint_kmer), res); return res; } template -neighbourhood dictionary::kmer_backward_neighbours(char const* string_kmer) const { +neighbourhood dictionary::kmer_backward_neighbours(char const* string_kmer) const { kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return kmer_backward_neighbours(uint_kmer); } + template -neighbourhood dictionary::kmer_backward_neighbours(kmer_t uint_kmer) const { - neighbourhood res; - kmer_t prefix = (uint_kmer << 2) & ((kmer_t(1) << (2 * m_k)) - 1); - backward_neighbours(prefix, res); +kmer_t dictionary::get_prefix(kmer_t kmer) const { + kmer_t prefix = kmer; + prefix.pad_char(); + prefix.take_chars(m_k); + return prefix; +} +template +neighbourhood dictionary::kmer_backward_neighbours(kmer_t uint_kmer) const { + neighbourhood res; + backward_neighbours(get_prefix(uint_kmer), res); return res; } template -neighbourhood dictionary::kmer_neighbours(char const* string_kmer) const { +neighbourhood dictionary::kmer_neighbours(char const* string_kmer) const { kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k); return kmer_neighbours(uint_kmer); } template -neighbourhood dictionary::kmer_neighbours(kmer_t uint_kmer) const { - neighbourhood res; - kmer_t suffix = uint_kmer >> 2; - forward_neighbours(suffix, res); - kmer_t prefix = (uint_kmer << 2) & ((kmer_t(1) << (2 * m_k)) - 1); - backward_neighbours(prefix, res); +neighbourhood dictionary::kmer_neighbours(kmer_t uint_kmer) const { + neighbourhood res; + forward_neighbours(get_suffix(uint_kmer), res); + backward_neighbours(get_prefix(uint_kmer), res); return res; } template -neighbourhood dictionary::contig_neighbours(uint64_t contig_id) const { +neighbourhood dictionary::contig_neighbours(uint64_t contig_id) const { assert(contig_id < num_contigs()); - neighbourhood res; + neighbourhood res; kmer_t suffix = m_buckets.contig_suffix(contig_id, m_k); forward_neighbours(suffix, res); - kmer_t prefix = m_buckets.contig_prefix(contig_id, m_k) << 2; + kmer_t prefix = m_buckets.contig_prefix(contig_id, m_k); + prefix.pad_char(); backward_neighbours(prefix, res); return res; } diff --git a/include/dump.impl b/include/dump.impl index 52846ef..cc9ce06 100644 --- a/include/dump.impl +++ b/include/dump.impl @@ -25,13 +25,13 @@ void dictionary::dump(std::string const& filename) const { (void)_; bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); uint64_t window_size = std::min(m_k - m_m + 1, contig_end - offset - m_k + 1); - uint64_t prev_minimizer = constants::invalid_uint64; + kmer_t prev_minimizer = constants::invalid_uint64; bool super_kmer_header_written = false; for (uint64_t w = 0; w != window_size; ++w) { - kmer_t kmer = bv_it.read_and_advance_by_two(2 * m_k); + kmer_t kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * m_k); auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed); if (m_canonical_parsing) { - kmer_t kmer_rc = util::compute_reverse_complement(kmer, m_k); + kmer_t kmer_rc = kmer.reverse_complement(m_k); auto [minimizer_rc, pos_rc] = util::compute_minimizer_pos(kmer_rc, m_k, m_m, m_seed); if (minimizer_rc < minimizer) { @@ -52,7 +52,9 @@ void dictionary::dump(std::string const& filename) const { if (minimizer != prev_minimizer) { break; } else { - out << util::uint64_to_char(kmer >> (2 * (m_k - 1))); + kmer_t last_char = kmer; + last_char.drop_chars(m_k - 1); + out << kmer_t::uint64_to_char(uint64_t(last_char)); } } prev_minimizer = minimizer; diff --git a/include/hash_util.hpp b/include/hash_util.hpp index ddc94ec..9607f83 100644 --- a/include/hash_util.hpp +++ b/include/hash_util.hpp @@ -11,10 +11,10 @@ struct kmers_pthash_hasher_64 { /* specialization for kmer_t */ static inline pthash::hash64 hash(kmer_t x, uint64_t seed) { - if constexpr (constants::uint_kmer_bits == 64) { + if constexpr (kmer_t::uint_kmer_bits == 64) { return pthash::MurmurHash2_64(reinterpret_cast(&x), sizeof(x), seed); } else { - assert(constants::uint_kmer_bits == 128); + assert(kmer_t::uint_kmer_bits == 128); uint64_t low = static_cast(x); uint64_t high = static_cast(x >> 64); uint64_t hash = @@ -31,11 +31,11 @@ struct kmers_pthash_hasher_128 { /* specialization for kmer_t */ static inline pthash::hash128 hash(kmer_t x, uint64_t seed) { - if constexpr (constants::uint_kmer_bits == 64) { + if constexpr (kmer_t::uint_kmer_bits == 64) { return {pthash::MurmurHash2_64(reinterpret_cast(&x), sizeof(x), seed), pthash::MurmurHash2_64(reinterpret_cast(&x), sizeof(x), ~seed)}; } else { - assert(constants::uint_kmer_bits == 128); + assert(kmer_t::uint_kmer_bits == 128); uint64_t low = static_cast(x); uint64_t high = static_cast(x >> 64); return { diff --git a/include/kmer.hpp b/include/kmer.hpp index c405828..4271801 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -2,8 +2,151 @@ namespace sshash { -using default_kmer_t = uint64_t; -// typedef uint64_t kmer_t; -// typedef __uint128_t kmer_t; +template +struct uint_kmer_t { + using uint_t = Int; + Int kmer; -} // namespace sshash \ No newline at end of file + uint_kmer_t() {} + uint_kmer_t(Int kmer) : kmer(kmer) {} + + explicit operator Int() const { return kmer; } + + // TODO: change to <=> when switching to C++20 + bool operator==(uint_kmer_t const& t) const { return kmer == t.kmer; } + bool operator!=(uint_kmer_t const& t) const { return kmer != t.kmer; } + bool operator<(uint_kmer_t const& t) const { return kmer < t.kmer; } + + // append b bits + void pad(uint16_t b) { kmer <<= b; } + + // append zero character + void pad_char() { kmer <<= bits_per_char; } + + // remove first b bits + void drop(uint16_t b) { kmer >>= b; } + + // remove first 64 bits + void drop64() { drop(64); } + + // remove first encoded character + void drop_char() { drop(bits_per_char); } + // remove first k encoded characters + void drop_chars(uint16_t k) { drop(k * bits_per_char); } + + // remove everything except first b bits + void take(uint16_t b) { kmer &= (Int(1) << b) - 1; } + // remove everything except first 64 bits + void take64() { take(64); } + + // remove everything except first character + void take_char() { take(bits_per_char); } + + // remove everything except first m character + void take_chars(uint16_t k) { take(k * bits_per_char); } + + // add a block of b bits in the front + void append_bits(uint64_t n, uint16_t b) { + pad(b); + kmer |= n; + } + + // add a block of 64 bits in the front + void append64(uint64_t n) { append_bits(n, 64); } + + // add a single character in the front + void append_char(uint64_t c) { append_bits(c, bits_per_char); } + + // assigns a character at k-th position + // assumes that the position is empty + // use append_char instead, if possible + void add_kth_char(uint16_t k, uint64_t c) { kmer |= Int(c) << (k * bits_per_char); } + + // total number of bits used to store a k-mer + static constexpr uint16_t uint_kmer_bits = sizeof(Int) * 8; + // number of bits dedicated to a single character + static constexpr uint8_t bits_per_char = BitsPerChar; + // max odd size that can be packed into uint_kmer_bits bits + static constexpr uint16_t max_k = []() { + uint16_t max_k_any = uint_kmer_bits / bits_per_char; + return max_k_any % 2 == 0 ? max_k_any - 1 : max_k_any; + }(); + + static_assert(uint_kmer_bits % 64 == 0, "Int must use 64*k bits"); + static_assert(bits_per_char <= 64, "At most 64 bits per character"); +}; + +template +struct alpha_kmer_t : uint_kmer_t { + using base = uint_kmer_t; + using base::base; + static constexpr char const* alphabet = Alphabet; + static constexpr uint8_t alphabet_size = strlen(Alphabet); + + static uint64_t char_to_uint(char c); + + static char uint64_to_char(uint64_t x) { return alphabet[x]; } +}; + +constexpr char dna_alphabet[] = "ACTG"; + +template +struct dna_uint_kmer_t : alpha_kmer_t { + using base = alpha_kmer_t; + using base::uint_kmer_bits; + using base::base; + /* + This works with the map: + A -> 00; C -> 01; G -> 11; T -> 10. + + Example. + reverse_complement("ACTCACG") = CGTGAGT, in binary: + reverse_complement("00.01.10.01.00.01.11") = 01.11.10.11.00.11.10. + */ + [[maybe_unused]] static uint64_t crc64(uint64_t x) { + /* Complement, swap byte order */ + uint64_t res = __builtin_bswap64(x ^ 0xaaaaaaaaaaaaaaaa); + + /* Swap nuc order in bytes */ + const uint64_t c1 = 0x0f0f0f0f0f0f0f0f; + const uint64_t c2 = 0x3333333333333333; + res = ((res & c1) << 4) | ((res & (c1 << 4)) >> 4); // swap 2-nuc order in bytes + res = ((res & c2) << 2) | ((res & (c2 << 2)) >> 2); // swap nuc order in 2-nuc + + return res; + } + + [[maybe_unused]] dna_uint_kmer_t reverse_complement(uint64_t k) { + dna_uint_kmer_t x(*this); + assert(k <= max_k); + dna_uint_kmer_t res(0); + for (uint16_t i = 0; i < uint_kmer_bits; i += 64) { + auto block = x; + block.take64(); + res.append64(crc64(uint64_t(block))); + x.drop64(); + } + // res is full reverse-complement to x + res.drop(uint_kmer_bits - k); + res.take_chars(k); + return res; + } + + /* + char decimal binary + A 65 01000-00-1 -> 00 + C 67 01000-01-1 -> 01 + G 71 01000-11-1 -> 11 + T 84 01010-10-0 -> 10 + + a 97 01100-00-1 -> 00 + c 99 01100-01-1 -> 01 + g 103 01100-11-1 -> 11 + t 116 01110-10-0 -> 10 + */ + static uint64_t char_to_uint(char c) { return (c >> 1) & 3; } +}; + +using default_kmer_t = dna_uint_kmer_t; + +} // namespace sshash diff --git a/include/minimizer_enumerator.hpp b/include/minimizer_enumerator.hpp index 1683794..a6819e4 100644 --- a/include/minimizer_enumerator.hpp +++ b/include/minimizer_enumerator.hpp @@ -57,34 +57,31 @@ struct minimizer_enumerator { , m_m(m) , m_seed(seed) , m_position(0) - , m_mask((kmer_t(1) << (2 * m_m)) - 1) , m_q(k - m + 1) /* deque cannot contain more than k - m + 1 elements */ {} uint64_t next(kmer_t kmer, bool clear, bool reverse = false) { if (clear) { - if (reverse) { - for (uint64_t i = 0; i != m_k - m_m + 1; ++i) { - uint64_t mmer = static_cast((kmer >> (2 * (m_k - m_m - i))) & m_mask); - eat(mmer); - } - } else { - for (uint64_t i = 0; i != m_k - m_m + 1; ++i) { - uint64_t mmer = static_cast(kmer & m_mask); - kmer >>= 2; - eat(mmer); + for (uint64_t i = 0; i != m_k - m_m + 1; ++i) { + kmer_t mmer = kmer; + if (reverse) { + mmer.drop_chars(m_k - m_m - i); + } else { + kmer.drop_char(); } + mmer.take_chars(m_m); + eat(uint64_t(mmer)); } } else { + kmer_t mmer = kmer; if (reverse) { - uint64_t mmer = static_cast(kmer & m_mask); - eat(mmer); + mmer.take_chars(m_m); } else { - uint64_t mmer = static_cast(kmer >> (2 * (m_k - m_m))); - eat(mmer); + mmer.drop_chars(m_k - m_m); } + eat(uint64_t(mmer)); } - return m_q.front().value; + return uint64_t(m_q.front().value); } private: @@ -96,9 +93,10 @@ struct minimizer_enumerator { struct mmer_t { mmer_t() {} - mmer_t(uint64_t hash, uint64_t position, uint64_t value) + mmer_t(uint64_t hash, uint64_t position, kmer_t value) : hash(hash), position(position), value(value) {} - uint64_t hash, position, value; + uint64_t hash, position; + kmer_t value; }; /* NOTE: we could use a std::deque here, diff --git a/include/query/streaming_query.hpp b/include/query/streaming_query.hpp index 128cb1a..f7ef41c 100644 --- a/include/query/streaming_query.hpp +++ b/include/query/streaming_query.hpp @@ -43,7 +43,8 @@ streaming_query_report streaming_query_from_fasta_file_multiline(dictionary -streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, std::istream& is) { +streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, + std::istream& is) { streaming_query_report report; std::string line; uint64_t k = dict->k(); @@ -66,7 +67,8 @@ streaming_query_report streaming_query_from_fasta_file(dictionary const* } template -streaming_query_report streaming_query_from_fastq_file(dictionary const* dict, std::istream& is) { +streaming_query_report streaming_query_from_fastq_file(dictionary const* dict, + std::istream& is) { streaming_query_report report; std::string line; uint64_t k = dict->k(); @@ -93,15 +95,15 @@ streaming_query_report streaming_query_from_fastq_file(dictionary const* } template -streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, std::istream& is, - bool multiline) { +streaming_query_report streaming_query_from_fasta_file(dictionary const* dict, + std::istream& is, bool multiline) { if (multiline) return streaming_query_from_fasta_file_multiline(dict, is); return streaming_query_from_fasta_file(dict, is); } -template +template streaming_query_report dictionary::streaming_query_from_file(std::string const& filename, - bool multiline) const { + bool multiline) const { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); streaming_query_report report; @@ -110,11 +112,13 @@ streaming_query_report dictionary::streaming_query_from_file(std::string zip_istream zis(is); if (canonicalized()) { - report = streaming_query_from_fasta_file>(this, zis, - multiline); + report = + streaming_query_from_fasta_file>( + this, zis, multiline); } else { - report = streaming_query_from_fasta_file>(this, zis, - multiline); + report = + streaming_query_from_fasta_file>( + this, zis, multiline); } } else if (util::ends_with(filename, ".fq.gz") or util::ends_with(filename, ".fastq.gz")) { @@ -125,18 +129,24 @@ streaming_query_report dictionary::streaming_query_from_file(std::string zip_istream zis(is); if (canonicalized()) { - report = streaming_query_from_fastq_file>(this, zis); + report = + streaming_query_from_fastq_file>( + this, zis); } else { - report = streaming_query_from_fastq_file>(this, zis); + report = + streaming_query_from_fastq_file>( + this, zis); } } else if (util::ends_with(filename, ".fa") or util::ends_with(filename, ".fasta")) { if (canonicalized()) { - report = streaming_query_from_fasta_file>(this, is, - multiline); + report = + streaming_query_from_fasta_file>( + this, is, multiline); } else { - report = streaming_query_from_fasta_file>(this, is, - multiline); + report = + streaming_query_from_fasta_file>( + this, is, multiline); } } else if (util::ends_with(filename, ".fq") or util::ends_with(filename, ".fastq")) { @@ -145,9 +155,13 @@ streaming_query_report dictionary::streaming_query_from_file(std::string << std::endl; } if (canonicalized()) { - report = streaming_query_from_fastq_file>(this, is); + report = + streaming_query_from_fastq_file>( + this, is); } else { - report = streaming_query_from_fastq_file>(this, is); + report = + streaming_query_from_fastq_file>( + this, is); } } else { diff --git a/include/query/streaming_query_canonical_parsing.hpp b/include/query/streaming_query_canonical_parsing.hpp index bf122be..7897a40 100644 --- a/include/query/streaming_query_canonical_parsing.hpp +++ b/include/query/streaming_query_canonical_parsing.hpp @@ -6,7 +6,7 @@ namespace sshash { -template +template struct streaming_query_canonical_parsing { streaming_query_canonical_parsing(dictionary const* dict) @@ -18,7 +18,7 @@ struct streaming_query_canonical_parsing { , m_prev_minimizer(constants::invalid_uint64) , m_kmer(constants::invalid_uint64) - , m_shift(2 * (dict->m_k - 1)) + , m_shift(dict->m_k - 1) , m_k(dict->m_k) , m_m(dict->m_m) , m_seed(dict->m_seed) @@ -54,19 +54,19 @@ struct streaming_query_canonical_parsing { /* 2. compute kmer and minimizer */ if (!m_start) { - m_kmer >>= 2; - m_kmer += (util::char_to_uint(kmer[m_k - 1])) << m_shift; + m_kmer.drop_char(); + m_kmer.add_kth_char(m_shift, kmer_t::char_to_uint(kmer[m_k - 1])); assert(m_kmer == util::string_to_uint_kmer(kmer, m_k)); } else { m_kmer = util::string_to_uint_kmer(kmer, m_k); } m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start); assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); - m_kmer_rc = util::compute_reverse_complement(m_kmer, m_k); + m_kmer_rc = m_kmer.reverse_complement(m_k); constexpr bool reverse = true; uint64_t minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start, reverse); assert(minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); - m_curr_minimizer = std::min(m_curr_minimizer, minimizer_rc); + m_curr_minimizer = std::min(m_curr_minimizer, minimizer_rc); /* 3. compute result */ if (m_start) { @@ -174,10 +174,10 @@ struct streaming_query_canonical_parsing { kmer_t val = m_string_iterator.read(2 * m_k); if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) { - kmer_t val_rc = util::compute_reverse_complement(val, m_k); + kmer_t val_rc = val.reverse_complement(m_k); uint64_t minimizer = - std::min(util::compute_minimizer(val, m_k, m_m, m_seed), - util::compute_minimizer(val_rc, m_k, m_m, m_seed)); + std::min(util::compute_minimizer(val, m_k, m_m, m_seed), + util::compute_minimizer(val_rc, m_k, m_m, m_seed)); if (minimizer != m_curr_minimizer) { m_minimizer_not_found = true; m_res = lookup_result(); diff --git a/include/query/streaming_query_regular_parsing.hpp b/include/query/streaming_query_regular_parsing.hpp index 2d13ff1..42527a9 100644 --- a/include/query/streaming_query_regular_parsing.hpp +++ b/include/query/streaming_query_regular_parsing.hpp @@ -6,7 +6,7 @@ namespace sshash { -template +template struct streaming_query_regular_parsing { streaming_query_regular_parsing(dictionary const* dict) @@ -26,7 +26,7 @@ struct streaming_query_regular_parsing { , m_kmer(constants::invalid_uint64) - , m_shift(2 * (dict->m_k - 1)) + , m_shift(dict->m_k - 1) , m_k(dict->m_k) , m_m(dict->m_m) , m_seed(dict->m_seed) @@ -58,15 +58,15 @@ struct streaming_query_regular_parsing { /* 2. compute kmer and minimizer */ if (!m_start) { - m_kmer >>= 2; - m_kmer += (util::char_to_uint(kmer[m_k - 1])) << m_shift; + m_kmer.drop_char(); + m_kmer.add_kth_char(m_shift, kmer_t::char_to_uint(kmer[m_k - 1])); assert(m_kmer == util::string_to_uint_kmer(kmer, m_k)); } else { m_kmer = util::string_to_uint_kmer(kmer, m_k); } m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start); assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); - m_kmer_rc = util::compute_reverse_complement(m_kmer, m_k); + m_kmer_rc = m_kmer.reverse_complement(m_k); constexpr bool reverse = true; m_curr_minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start, reverse); assert(m_curr_minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); @@ -227,7 +227,7 @@ struct streaming_query_regular_parsing { kmer_t val = m_string_iterator.read(2 * m_k); if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) { - uint64_t minimizer = util::compute_minimizer(val, m_k, m_m, m_seed); + auto minimizer = util::compute_minimizer(val, m_k, m_m, m_seed); if (minimizer != m_curr_minimizer) { m_minimizer_not_found = true; m_res = lookup_result(); @@ -269,7 +269,7 @@ struct streaming_query_regular_parsing { kmer_t val = m_string_iterator.read(2 * m_k); if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) { - uint64_t minimizer = util::compute_minimizer(val, m_k, m_m, m_seed); + auto minimizer = util::compute_minimizer(val, m_k, m_m, m_seed); if (minimizer != m_curr_minimizer_rc) { m_minimizer_rc_not_found = true; m_res = lookup_result(); diff --git a/include/statistics.impl b/include/statistics.impl index 61b931e..e9f59fe 100644 --- a/include/statistics.impl +++ b/include/statistics.impl @@ -22,13 +22,13 @@ void dictionary::compute_statistics() const { (void)_; bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); uint64_t window_size = std::min(m_k - m_m + 1, contig_end - offset - m_k + 1); - uint64_t prev_minimizer = constants::invalid_uint64; + kmer_t prev_minimizer = constants::invalid_uint64; uint64_t w = 0; for (; w != window_size; ++w) { - uint64_t kmer = bv_it.read_and_advance_by_two(2 * m_k); + kmer_t kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * m_k); auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed); if (m_canonical_parsing) { - uint64_t kmer_rc = util::compute_reverse_complement(kmer, m_k); + kmer_t kmer_rc = kmer.reverse_complement(m_k); auto [minimizer_rc, pos_rc] = util::compute_minimizer_pos(kmer_rc, m_k, m_m, m_seed); if (minimizer_rc < minimizer) { diff --git a/include/util.hpp b/include/util.hpp index 87ebb41..5f44e43 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -32,17 +32,11 @@ struct lookup_result { uint32_t contig_size; }; +template struct neighbourhood { /* forward */ - lookup_result forward_A; - lookup_result forward_C; - lookup_result forward_G; - lookup_result forward_T; - /* backward */ - lookup_result backward_A; - lookup_result backward_C; - lookup_result backward_G; - lookup_result backward_T; + std::array forward; + std::array backward; }; [[maybe_unused]] static bool equal_lookup_result(lookup_result expected, lookup_result got) { @@ -127,110 +121,34 @@ static inline uint32_t ceil_log2_uint32(uint32_t x) { return (x > 1) ? msb(x - 1 return std::equal(pattern.begin(), pattern.end(), str.end() - pattern.size()); } -/* -char decimal binary - A 65 01000-00-1 -> 00 - C 67 01000-01-1 -> 01 - G 71 01000-11-1 -> 11 - T 84 01010-10-0 -> 10 - - a 97 01100-00-1 -> 00 - c 99 01100-01-1 -> 01 - g 103 01100-11-1 -> 11 - t 116 01110-10-0 -> 10 -*/ -template -static kmer_t char_to_uint(char c) { - return (c >> 1) & 3; -} - -static char uint64_to_char(uint64_t x) { - assert(x <= 3); - static char nucleotides[4] = {'A', 'C', 'T', 'G'}; - return nucleotides[x]; -} - template [[maybe_unused]] static kmer_t string_to_uint_kmer(char const* str, uint64_t k) { - assert(k <= constants::max_k); + assert(k <= kmer_t::max_k); kmer_t x = 0; - for (uint64_t i = 0; i != k; ++i) x += char_to_uint(str[i]) << (2 * i); + for (int i = k - 1; i >= 0; i--) { x.append_char(kmer_t::char_to_uint(str[i])); } return x; } template static void uint_kmer_to_string(kmer_t x, char* str, uint64_t k) { - assert(k <= constants::max_k); + assert(k <= kmer_t::max_k); for (uint64_t i = 0; i != k; ++i) { - str[i] = uint64_to_char(x & 3); - x >>= 2; + auto first_char = x; + first_char.take_char(); + str[i] = kmer_t::uint64_to_char(uint64_t(first_char)); + x.drop_char(); } } template [[maybe_unused]] static std::string uint_kmer_to_string(kmer_t x, uint64_t k) { - assert(k <= constants::max_k); + assert(k <= kmer_t::max_k); std::string str; str.resize(k); uint_kmer_to_string(x, str.data(), k); return str; } -/* - This works with the map: - A -> 00; C -> 01; G -> 11; T -> 10. - - Example. - reverse_complement("ACTCACG") = CGTGAGT, in binary: - reverse_complement("00.01.10.01.00.01.11") = 01.11.10.11.00.11.10. -*/ -template -[[maybe_unused]] static uint64_t crc(uint64_t x, uint64_t k) { - assert(k <= 32); - - /* Complement, swap byte order */ - uint64_t res = __builtin_bswap64(x ^ 0xaaaaaaaaaaaaaaaa); - - /* Swap nuc order in bytes */ - const uint64_t c1 = 0x0f0f0f0f0f0f0f0f; - const uint64_t c2 = 0x3333333333333333; - res = ((res & c1) << 4) | ((res & (c1 << 4)) >> 4); // swap 2-nuc order in bytes - res = ((res & c2) << 2) | ((res & (c2 << 2)) >> 2); // swap nuc order in 2-nuc - - /* Realign to the right */ - if constexpr (align) res >>= 64 - 2 * k; - - return res; -} - -template -[[maybe_unused]] static kmer_t compute_reverse_complement(kmer_t x, uint64_t k) { - assert(k <= constants::max_k); - if constexpr (constants::uint_kmer_bits == 64) { - return crc(x, k); - } else { - assert(constants::uint_kmer_bits == 128); - uint64_t low = static_cast(x); - uint64_t high = static_cast(x >> 64); - uint64_t k_low = 32; - uint64_t k_high = k - 32; - uint64_t shift = 128 - 2 * k; - if (k < 32) { - k_low = k; - k_high = 0; - shift = 64; - } - kmer_t low_rc = crc(low, k_low); - kmer_t high_rc = crc(high, k_high); - kmer_t res = (low_rc << 64) + high_rc; - - /* Realign to the right */ - res >>= shift; - - return res; - } -} - /* Forward character map: A -> A 65 @@ -296,43 +214,43 @@ static inline bool is_valid(int c) { return canonicalize_basepair_forward_map[c] } template -uint64_t compute_minimizer(kmer_t kmer, uint64_t k, uint64_t m, uint64_t seed) { +kmer_t compute_minimizer(kmer_t kmer, uint64_t k, uint64_t m, uint64_t seed) { assert(m <= constants::max_m); assert(m <= k); uint64_t min_hash = uint64_t(-1); - uint64_t minimizer = uint64_t(-1); - kmer_t mask = (kmer_t(1) << (2 * m)) - 1; + kmer_t minimizer = kmer_t(-1); for (uint64_t i = 0; i != k - m + 1; ++i) { - uint64_t mmer = static_cast(kmer & mask); - uint64_t hash = Hasher::hash(mmer, seed); + kmer_t mmer = kmer; + mmer.take_chars(m); + uint64_t hash = Hasher::hash(uint64_t(mmer), seed); if (hash < min_hash) { min_hash = hash; minimizer = mmer; } - kmer >>= 2; + kmer.drop_char(); } return minimizer; } /* used in dump.cpp */ template -std::pair compute_minimizer_pos(kmer_t kmer, uint64_t k, uint64_t m, - uint64_t seed) { +std::pair compute_minimizer_pos(kmer_t kmer, uint64_t k, uint64_t m, + uint64_t seed) { assert(m <= constants::max_m); assert(m <= k); uint64_t min_hash = uint64_t(-1); - uint64_t minimizer = uint64_t(-1); - kmer_t mask = (kmer_t(1) << (2 * m)) - 1; + kmer_t minimizer = kmer_t(-1); uint64_t pos = 0; for (uint64_t i = 0; i != k - m + 1; ++i) { - uint64_t mmer = static_cast(kmer & mask); - uint64_t hash = Hasher::hash(mmer, seed); + kmer_t mmer = kmer; + mmer.take_chars(m); + uint64_t hash = Hasher::hash(uint64_t(mmer), seed); if (hash < min_hash) { min_hash = hash; minimizer = mmer; pos = i; } - kmer >>= 2; + kmer.drop_char(); } return {minimizer, pos}; } diff --git a/src/bench_utils.hpp b/src/bench_utils.hpp index c7f70ba..f32dd18 100644 --- a/src/bench_utils.hpp +++ b/src/bench_utils.hpp @@ -2,7 +2,7 @@ namespace sshash { -template +template void perf_test_iterator(dictionary const& dict) { essentials::timer t; t.start(); @@ -17,7 +17,7 @@ void perf_test_iterator(dictionary const& dict) { std::cout << "iterator: avg_nanosec_per_kmer " << avg_nanosec << std::endl; } -template +template void perf_test_lookup_access(dictionary const& dict) { constexpr uint64_t num_queries = 1000000; constexpr uint64_t runs = 5; @@ -140,7 +140,7 @@ void perf_test_lookup_access(dictionary const& dict) { } } -template +template void perf_test_lookup_weight(dictionary const& dict) { if (!dict.weighted()) { std::cerr << "ERROR: the dictionary does not store weights" << std::endl; diff --git a/src/build.cpp b/src/build.cpp index 4c5b9d4..8fe5a1a 100644 --- a/src/build.cpp +++ b/src/build.cpp @@ -1,5 +1,6 @@ using namespace sshash; +template int build(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); @@ -10,8 +11,7 @@ int build(int argc, char** argv) { "\t- one DNA sequence per line.\n" "\tFor example, it could be the de Bruijn graph topology output by BCALM.", "-i", true); - parser.add("k", "K-mer length (must be <= " + std::to_string(constants::max_k) + ").", "-k", - true); + parser.add("k", "K-mer length (must be <= " + std::to_string(kmer_t::max_k) + ").", "-k", true); parser.add("m", "Minimizer length (must be < k).", "-m", true); /* Optional arguments. */ @@ -53,7 +53,7 @@ int build(int argc, char** argv) { auto k = parser.get("k"); auto m = parser.get("m"); - dictionary dict; + dictionary dict; build_configuration build_config; build_config.k = k; diff --git a/src/check_utils.hpp b/src/check_utils.hpp index fa2aa9c..807369f 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -6,7 +6,7 @@ namespace sshash { -template +template bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) { uint64_t k = dict.k(); uint64_t n = dict.size(); @@ -51,7 +51,7 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& /* transform 50% of the kmers into their reverse complements */ if ((num_kmers & 1) == 0) { - uint_kmer = util::compute_reverse_complement(uint_kmer, k); + uint_kmer = uint_kmer.reverse_complement(k); orientation = constants::backward_orientation; } @@ -140,7 +140,7 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& // check access dict.access(id, got_kmer_str.data()); kmer_t got_uint_kmer = util::string_to_uint_kmer(got_kmer_str.data(), k); - kmer_t got_uint_kmer_rc = util::compute_reverse_complement(got_uint_kmer, k); + kmer_t got_uint_kmer_rc = got_uint_kmer.reverse_complement(k); if (got_uint_kmer != uint_kmer and got_uint_kmer_rc != uint_kmer) { std::cout << "ERROR: got '" << got_kmer_str << "' but expected '" << expected_kmer_str << "'" << std::endl; @@ -177,7 +177,7 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& return true; } -template +template bool check_correctness_navigational_kmer_query(std::istream& is, dictionary const& dict) { uint64_t k = dict.k(); std::string line; @@ -197,64 +197,24 @@ bool check_correctness_navigational_kmer_query(std::istream& is, dictionary curr = dict.kmer_neighbours(line.data() + i); char next_nuc = line[i + k]; - switch (next_nuc) { - case 'A': - if (curr.forward_A.kmer_id == constants::invalid_uint64) { - std::cout << "expected forward_A" << std::endl; - } - assert(curr.forward_A.kmer_id != constants::invalid_uint64); - break; - case 'C': - if (curr.forward_C.kmer_id == constants::invalid_uint64) { - std::cout << "expected forward_C" << std::endl; - } - assert(curr.forward_C.kmer_id != constants::invalid_uint64); - break; - case 'G': - if (curr.forward_G.kmer_id == constants::invalid_uint64) { - std::cout << "expected forward_G" << std::endl; - } - assert(curr.forward_G.kmer_id != constants::invalid_uint64); - break; - case 'T': - if (curr.forward_T.kmer_id == constants::invalid_uint64) { - std::cout << "expected forward_T" << std::endl; - } - assert(curr.forward_T.kmer_id != constants::invalid_uint64); - break; + bool next_nuc_not_found = + curr.forward[kmer_t::char_to_uint(next_nuc)].kmer_id == constants::invalid_uint64; + if (next_nuc_not_found) { + std::cout << "expected forward[" << next_nuc << "]" << std::endl; } + assert(!next_nuc_not_found); if (i != 0) { char prev_nuc = line[i - 1]; - switch (prev_nuc) { - case 'A': - if (curr.backward_A.kmer_id == constants::invalid_uint64) { - std::cout << "expected backward_A" << std::endl; - } - assert(curr.backward_A.kmer_id != constants::invalid_uint64); - break; - case 'C': - if (curr.backward_C.kmer_id == constants::invalid_uint64) { - std::cout << "expected backward_C" << std::endl; - } - assert(curr.backward_C.kmer_id != constants::invalid_uint64); - break; - case 'G': - if (curr.backward_G.kmer_id == constants::invalid_uint64) { - std::cout << "expected backward_G" << std::endl; - } - assert(curr.backward_G.kmer_id != constants::invalid_uint64); - break; - case 'T': - if (curr.backward_T.kmer_id == constants::invalid_uint64) { - std::cout << "expected backward_T" << std::endl; - } - assert(curr.backward_T.kmer_id != constants::invalid_uint64); - break; + bool prev_nuc_not_found = curr.backward[kmer_t::char_to_uint(prev_nuc)].kmer_id == + constants::invalid_uint64; + if (prev_nuc_not_found) { + std::cout << "expected backward[" << prev_nuc << "]" << std::endl; } + assert(!prev_nuc_not_found); } ++num_kmers; @@ -273,7 +233,7 @@ bool check_correctness_navigational_kmer_query(std::istream& is, dictionary +template bool check_correctness_navigational_contig_query(dictionary const& dict) { std::cout << "checking correctness of navigational queries for contigs..." << std::endl; uint64_t num_contigs = dict.num_contigs(); @@ -292,19 +252,16 @@ bool check_correctness_navigational_contig_query(dictionary const& dict) uint64_t begin_kmer_id = kmer_id; dict.access(begin_kmer_id, kmer.data()); auto backward = dict.kmer_backward_neighbours(kmer.data()); - equal_lookup_result(backward.backward_A, res.backward_A); - equal_lookup_result(backward.backward_C, res.backward_C); - equal_lookup_result(backward.backward_G, res.backward_G); - equal_lookup_result(backward.backward_T, res.backward_T); + for (size_t i = 0; i < kmer_t::alphabet_size; i++) { + equal_lookup_result(backward.backward[i], res.backward[i]); + } uint64_t end_kmer_id = kmer_id + contig_size - 1; dict.access(end_kmer_id, kmer.data()); auto forward = dict.kmer_forward_neighbours(kmer.data()); - equal_lookup_result(forward.forward_A, res.forward_A); - equal_lookup_result(forward.forward_C, res.forward_C); - equal_lookup_result(forward.forward_G, res.forward_G); - equal_lookup_result(forward.forward_T, res.forward_T); - + for (size_t i = 0; i < kmer_t::alphabet_size; i++) { + equal_lookup_result(forward.forward[i], res.forward[i]); + } kmer_id += contig_size; } std::cout << "checked " << contig_id << " contigs" << std::endl; @@ -312,7 +269,7 @@ bool check_correctness_navigational_contig_query(dictionary const& dict) return true; } -template +template bool check_correctness_weights(std::istream& is, dictionary const& dict) { uint64_t k = dict.k(); std::string line; @@ -368,7 +325,7 @@ bool check_correctness_weights(std::istream& is, dictionary const& dict) The input file must be the one the index was built from. Throughout the code, we assume the input does not contain any duplicate. */ -template +template bool check_correctness_lookup_access(dictionary const& dict, std::string const& filename) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); @@ -387,7 +344,7 @@ bool check_correctness_lookup_access(dictionary const& dict, std::string The input file must be the one the index was built from. Throughout the code, we assume the input does not contain any duplicate. */ -template +template bool check_correctness_navigational_kmer_query(dictionary const& dict, std::string const& filename) { std::ifstream is(filename.c_str()); @@ -406,7 +363,7 @@ bool check_correctness_navigational_kmer_query(dictionary const& dict, /* The input file must be the one the index was built from. */ -template +template bool check_correctness_weights(dictionary const& dict, std::string const& filename) { std::ifstream is(filename.c_str()); if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'"); @@ -421,7 +378,7 @@ bool check_correctness_weights(dictionary const& dict, std::string const return good; } -template +template bool check_dictionary(dictionary const& dict) { uint64_t k = dict.k(); uint64_t n = dict.size(); @@ -450,7 +407,7 @@ bool check_dictionary(dictionary const& dict) { return true; } -template +template bool check_correctness_iterator(dictionary const& dict) { std::cout << "checking correctness of iterator..." << std::endl; std::string expected_kmer(dict.k(), 0); diff --git a/src/common.hpp b/src/common.hpp index dcd07c9..13b2d7e 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -11,7 +11,7 @@ void random_kmer(char* kmer, uint64_t k) { for (uint64_t i = 0; i != k; ++i) kmer[i] = "ACGT"[rand() % 4]; } -template +template void load_dictionary(dictionary& dict, std::string const& index_filename, bool verbose) { uint64_t num_bytes_read = essentials::load(dict, index_filename.c_str()); if (verbose) { diff --git a/src/permute.cpp b/src/permute.cpp index 7a9f369..86ae745 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -3,6 +3,7 @@ using namespace sshash; +template int permute(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); @@ -16,8 +17,7 @@ int permute(int argc, char** argv) { "\tFor example, it could be the de Bruijn graph topology output " "by BCALM.", "-i", true); - parser.add("k", "K-mer length (must be <= " + std::to_string(constants::max_k) + ").", "-k", - true); + parser.add("k", "K-mer length (must be <= " + std::to_string(kmer_t::max_k) + ").", "-k", true); /* Optional arguments. */ parser.add("output_filename", "Output file where the permuted collection will be written.", @@ -38,8 +38,8 @@ int permute(int argc, char** argv) { std::cerr << "k must be > 0" << std::endl; return 1; } - if (k > constants::max_k) { - std::cerr << "k must be less <= " + std::to_string(constants::max_k) + + if (k > kmer_t::max_k) { + std::cerr << "k must be less <= " + std::to_string(kmer_t::max_k) + " but got k = " + std::to_string(k) << std::endl; return 1; diff --git a/src/query.cpp b/src/query.cpp index ebbb6d0..57377eb 100644 --- a/src/query.cpp +++ b/src/query.cpp @@ -3,6 +3,7 @@ using namespace sshash; +template int query(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); @@ -22,7 +23,7 @@ int query(int argc, char** argv) { bool verbose = parser.get("verbose"); bool multiline = parser.get("multiline"); - dictionary dict; + dictionary dict; load_dictionary(dict, index_filename, verbose); essentials::logger("performing queries from file '" + query_filename + "'..."); diff --git a/src/sshash.cpp b/src/sshash.cpp index 674b881..9767cee 100644 --- a/src/sshash.cpp +++ b/src/sshash.cpp @@ -9,8 +9,7 @@ using namespace sshash; -using kmer_t = default_kmer_t; - +template int check(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); @@ -25,6 +24,7 @@ int check(int argc, char** argv) { return 0; } +template int bench(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); @@ -40,6 +40,7 @@ int bench(int argc, char** argv) { return 0; } +template int dump(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); @@ -55,6 +56,7 @@ int dump(int argc, char** argv) { return 0; } +template int compute_statistics(int argc, char** argv) { cmd_line_parser::parser parser(argc, argv); parser.add("index_filename", "Must be a file generated with the tool 'build'.", "-i", true); @@ -88,19 +90,19 @@ int main(int argc, char** argv) { if (argc < 2) return help(argv[0]); auto tool = std::string(argv[1]); if (tool == "build") { - return build(argc - 1, argv + 1); + return build(argc - 1, argv + 1); } else if (tool == "query") { - return query(argc - 1, argv + 1); + return query(argc - 1, argv + 1); } else if (tool == "check") { - return check(argc - 1, argv + 1); + return check(argc - 1, argv + 1); } else if (tool == "bench") { - return bench(argc - 1, argv + 1); + return bench(argc - 1, argv + 1); } else if (tool == "dump") { - return dump(argc - 1, argv + 1); + return dump(argc - 1, argv + 1); } else if (tool == "permute") { - return permute(argc - 1, argv + 1); + return permute(argc - 1, argv + 1); } else if (tool == "compute-statistics") { - return compute_statistics(argc - 1, argv + 1); + return compute_statistics(argc - 1, argv + 1); } std::cout << "Unsupported tool '" << tool << "'.\n" << std::endl; return help(argv[0]); diff --git a/test/test_alphabet.cpp b/test/test_alphabet.cpp index 3cc131e..69fcd26 100644 --- a/test/test_alphabet.cpp +++ b/test/test_alphabet.cpp @@ -35,24 +35,24 @@ int main(int /*argc*/, char** argv) { << std::endl; } std::cout << "read: '" << std::string(read.data() + i, k) << "'; "; - default_kmer_t x = util::string_to_uint_kmer(read.data() + i, k); + kmer_t x = util::string_to_uint_kmer(read.data() + i, k); std::string kmer = util::uint_kmer_to_string(x, k); std::cout << "capitalized: '" << kmer << "'" << std::endl; } /****/ - expect(util::char_to_uint('A'), kmer_t(0)); - expect(util::char_to_uint('a'), kmer_t(0)); + expect(kmer_t::char_to_uint('A'), uint64_t(0)); + expect(kmer_t::char_to_uint('a'), uint64_t(0)); - expect(util::char_to_uint('C'), kmer_t(1)); - expect(util::char_to_uint('c'), kmer_t(1)); + expect(kmer_t::char_to_uint('C'), uint64_t(1)); + expect(kmer_t::char_to_uint('c'), uint64_t(1)); - expect(util::char_to_uint('T'), kmer_t(2)); - expect(util::char_to_uint('t'), kmer_t(2)); + expect(kmer_t::char_to_uint('T'), uint64_t(2)); + expect(kmer_t::char_to_uint('t'), uint64_t(2)); - expect(util::char_to_uint('G'), kmer_t(3)); - expect(util::char_to_uint('g'), kmer_t(3)); + expect(kmer_t::char_to_uint('G'), uint64_t(3)); + expect(kmer_t::char_to_uint('g'), uint64_t(3)); /****/ From a68c92e359719793ca5bb7dd8bc013b14597122b Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 16 Jan 2024 17:39:22 +0100 Subject: [PATCH 04/21] Change default Int to uint128_t --- include/bit_vector_iterator.hpp | 21 ++++------- include/dump.impl | 2 +- include/hash_util.hpp | 39 ++++++++------------ include/kmer.hpp | 63 +++++++++++++------------------- include/minimizer_enumerator.hpp | 6 +-- include/util.hpp | 7 +--- test/test_alphabet.cpp | 6 ++- 7 files changed, 57 insertions(+), 87 deletions(-) diff --git a/include/bit_vector_iterator.hpp b/include/bit_vector_iterator.hpp index bbe24a3..104356f 100644 --- a/include/bit_vector_iterator.hpp +++ b/include/bit_vector_iterator.hpp @@ -36,7 +36,7 @@ struct bit_vector_iterator { inline void eat(uint64_t l) { assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf(); - if (l != kmer_t::uint_kmer_bits) m_buf.drop(l); + m_buf.drop(l); m_avail -= l; m_pos += l; } @@ -44,7 +44,7 @@ struct bit_vector_iterator { inline void eat_reverse(uint64_t l) { assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf_reverse(); - if (l != kmer_t::uint_kmer_bits) m_buf.pad(l); + m_buf.pad(l); m_avail -= l; m_pos -= l; } @@ -53,10 +53,8 @@ struct bit_vector_iterator { assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf(); kmer_t val = m_buf; - if (l != kmer_t::uint_kmer_bits) { - val.take(l); - m_buf.drop_char(); - } + val.take(l); + m_buf.drop_char(); m_avail -= 2; m_pos += 2; return val; @@ -64,22 +62,17 @@ struct bit_vector_iterator { inline uint64_t get_next_char() { if (m_avail < kmer_t::bits_per_char) fill_buf(); - kmer_t val = m_buf; - val.take_char(); - m_buf.drop_char(); m_avail -= 2; m_pos += 2; - return uint64_t(val); + return m_buf.pop_char(); } inline kmer_t take(uint64_t l) { assert(l <= kmer_t::uint_kmer_bits); if (m_avail < l) fill_buf(); kmer_t val = m_buf; - if (l != kmer_t::uint_kmer_bits) { - val.take(l); - m_buf.drop(l); - } + val.take(l); + m_buf.drop(l); m_avail -= l; m_pos += l; return val; diff --git a/include/dump.impl b/include/dump.impl index cc9ce06..a51973d 100644 --- a/include/dump.impl +++ b/include/dump.impl @@ -54,7 +54,7 @@ void dictionary::dump(std::string const& filename) const { } else { kmer_t last_char = kmer; last_char.drop_chars(m_k - 1); - out << kmer_t::uint64_to_char(uint64_t(last_char)); + out << kmer_t::uint64_to_char(last_char.pop_char()); } } prev_minimizer = minimizer; diff --git a/include/hash_util.hpp b/include/hash_util.hpp index 9607f83..10b81a9 100644 --- a/include/hash_util.hpp +++ b/include/hash_util.hpp @@ -11,17 +11,13 @@ struct kmers_pthash_hasher_64 { /* specialization for kmer_t */ static inline pthash::hash64 hash(kmer_t x, uint64_t seed) { - if constexpr (kmer_t::uint_kmer_bits == 64) { - return pthash::MurmurHash2_64(reinterpret_cast(&x), sizeof(x), seed); - } else { - assert(kmer_t::uint_kmer_bits == 128); - uint64_t low = static_cast(x); - uint64_t high = static_cast(x >> 64); - uint64_t hash = - pthash::MurmurHash2_64(reinterpret_cast(&low), sizeof(low), seed) ^ - pthash::MurmurHash2_64(reinterpret_cast(&high), sizeof(high), ~seed); - return hash; + uint64_t hash = 0; + for (int i = 0; i < kmer_t::uint_kmer_bits; i += 64) { + uint64_t block = x.pop64(); + hash ^= pthash::MurmurHash2_64(reinterpret_cast(&block), sizeof(block), + seed + i); } + return hash; } }; @@ -31,21 +27,16 @@ struct kmers_pthash_hasher_128 { /* specialization for kmer_t */ static inline pthash::hash128 hash(kmer_t x, uint64_t seed) { - if constexpr (kmer_t::uint_kmer_bits == 64) { - return {pthash::MurmurHash2_64(reinterpret_cast(&x), sizeof(x), seed), - pthash::MurmurHash2_64(reinterpret_cast(&x), sizeof(x), ~seed)}; - } else { - assert(kmer_t::uint_kmer_bits == 128); - uint64_t low = static_cast(x); - uint64_t high = static_cast(x >> 64); - return { - pthash::MurmurHash2_64(reinterpret_cast(&low), sizeof(low), seed) ^ - pthash::MurmurHash2_64(reinterpret_cast(&high), sizeof(high), - ~seed), - pthash::MurmurHash2_64(reinterpret_cast(&low), sizeof(low), seed + 1) ^ - pthash::MurmurHash2_64(reinterpret_cast(&high), sizeof(high), - ~(seed + 1))}; + uint64_t hash_first = 0; + uint64_t hash_second = 0; + for (int i = 0; i < kmer_t::uint_kmer_bits; i += 64) { + uint64_t block = x.pop64(); + hash_first ^= pthash::MurmurHash2_64(reinterpret_cast(&block), + sizeof(block), seed + i); + hash_second ^= pthash::MurmurHash2_64(reinterpret_cast(&block), + sizeof(block), ~seed + i); } + return {hash_first, hash_second}; } }; diff --git a/include/kmer.hpp b/include/kmer.hpp index 4271801..c4007fc 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -10,63 +10,58 @@ struct uint_kmer_t { uint_kmer_t() {} uint_kmer_t(Int kmer) : kmer(kmer) {} - explicit operator Int() const { return kmer; } + // Only if you're sure the result is uint64_t + // e.g. with minimizers + explicit operator uint64_t() const { + assert(uint64_t(kmer) == kmer); + return kmer; + } // TODO: change to <=> when switching to C++20 bool operator==(uint_kmer_t const& t) const { return kmer == t.kmer; } bool operator!=(uint_kmer_t const& t) const { return kmer != t.kmer; } bool operator<(uint_kmer_t const& t) const { return kmer < t.kmer; } - // append b bits void pad(uint16_t b) { kmer <<= b; } - - // append zero character void pad_char() { kmer <<= bits_per_char; } - // remove first b bits void drop(uint16_t b) { kmer >>= b; } - - // remove first 64 bits void drop64() { drop(64); } - - // remove first encoded character void drop_char() { drop(bits_per_char); } - // remove first k encoded characters void drop_chars(uint16_t k) { drop(k * bits_per_char); } - // remove everything except first b bits void take(uint16_t b) { kmer &= (Int(1) << b) - 1; } - // remove everything except first 64 bits - void take64() { take(64); } - - // remove everything except first character + void take64() { kmer = uint64_t(kmer); } void take_char() { take(bits_per_char); } - - // remove everything except first m character void take_chars(uint16_t k) { take(k * bits_per_char); } - // add a block of b bits in the front + uint64_t pop64() { + uint64_t res(kmer); + drop64(); + return res; + } + uint64_t pop_char() { + uint64_t res(kmer); + res &= (uint64_t(1) << bits_per_char) - 1; + drop_char(); + return res; + } + void append_bits(uint64_t n, uint16_t b) { pad(b); kmer |= n; } - - // add a block of 64 bits in the front void append64(uint64_t n) { append_bits(n, 64); } - - // add a single character in the front void append_char(uint64_t c) { append_bits(c, bits_per_char); } // assigns a character at k-th position - // assumes that the position is empty + // assuming that the position is empty // use append_char instead, if possible void add_kth_char(uint16_t k, uint64_t c) { kmer |= Int(c) << (k * bits_per_char); } - // total number of bits used to store a k-mer static constexpr uint16_t uint_kmer_bits = sizeof(Int) * 8; - // number of bits dedicated to a single character static constexpr uint8_t bits_per_char = BitsPerChar; - // max odd size that can be packed into uint_kmer_bits bits + // max *odd* size that can be packed into uint_kmer_bits bits static constexpr uint16_t max_k = []() { uint16_t max_k_any = uint_kmer_bits / bits_per_char; return max_k_any % 2 == 0 ? max_k_any - 1 : max_k_any; @@ -90,9 +85,9 @@ struct alpha_kmer_t : uint_kmer_t { constexpr char dna_alphabet[] = "ACTG"; -template -struct dna_uint_kmer_t : alpha_kmer_t { - using base = alpha_kmer_t; +template +struct dna_uint_kmer_t : alpha_kmer_t { + using base = alpha_kmer_t; using base::uint_kmer_bits; using base::base; /* @@ -120,15 +115,9 @@ struct dna_uint_kmer_t : alpha_kmer_t { dna_uint_kmer_t x(*this); assert(k <= max_k); dna_uint_kmer_t res(0); - for (uint16_t i = 0; i < uint_kmer_bits; i += 64) { - auto block = x; - block.take64(); - res.append64(crc64(uint64_t(block))); - x.drop64(); - } + for (uint16_t i = 0; i < uint_kmer_bits; i += 64) { res.append64(crc64(x.pop64())); } // res is full reverse-complement to x res.drop(uint_kmer_bits - k); - res.take_chars(k); return res; } @@ -147,6 +136,6 @@ struct dna_uint_kmer_t : alpha_kmer_t { static uint64_t char_to_uint(char c) { return (c >> 1) & 3; } }; -using default_kmer_t = dna_uint_kmer_t; +using default_kmer_t = dna_uint_kmer_t<__uint128_t>; } // namespace sshash diff --git a/include/minimizer_enumerator.hpp b/include/minimizer_enumerator.hpp index a6819e4..e002a27 100644 --- a/include/minimizer_enumerator.hpp +++ b/include/minimizer_enumerator.hpp @@ -81,7 +81,7 @@ struct minimizer_enumerator { } eat(uint64_t(mmer)); } - return uint64_t(m_q.front().value); + return m_q.front().value; } private: @@ -93,10 +93,10 @@ struct minimizer_enumerator { struct mmer_t { mmer_t() {} - mmer_t(uint64_t hash, uint64_t position, kmer_t value) + mmer_t(uint64_t hash, uint64_t position, uint64_t value) : hash(hash), position(position), value(value) {} uint64_t hash, position; - kmer_t value; + uint64_t value; }; /* NOTE: we could use a std::deque here, diff --git a/include/util.hpp b/include/util.hpp index 5f44e43..1e9fc39 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -132,12 +132,7 @@ template template static void uint_kmer_to_string(kmer_t x, char* str, uint64_t k) { assert(k <= kmer_t::max_k); - for (uint64_t i = 0; i != k; ++i) { - auto first_char = x; - first_char.take_char(); - str[i] = kmer_t::uint64_to_char(uint64_t(first_char)); - x.drop_char(); - } + for (uint64_t i = 0; i != k; ++i) { str[i] = kmer_t::uint64_to_char(x.pop_char()); } } template diff --git a/test/test_alphabet.cpp b/test/test_alphabet.cpp index 69fcd26..12d7069 100644 --- a/test/test_alphabet.cpp +++ b/test/test_alphabet.cpp @@ -1,4 +1,5 @@ #include "../include/util.hpp" +#include using namespace sshash; @@ -34,10 +35,11 @@ int main(int /*argc*/, char** argv) { std::cerr << "ERROR: '" << std::string(read.data() + i, k) << "' is valid!" << std::endl; } - std::cout << "read: '" << std::string(read.data() + i, k) << "'; "; kmer_t x = util::string_to_uint_kmer(read.data() + i, k); std::string kmer = util::uint_kmer_to_string(x, k); - std::cout << "capitalized: '" << kmer << "'" << std::endl; + std::transform(kmer.begin(), kmer.end(), kmer.begin(), + [](unsigned char c) { return std::tolower(c); }); + expect(std::string(read.data() + i, k), kmer); } /****/ From 970667bfef6f2cfe8aefa8b730b7d837d8ee1ccb Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 16 Jan 2024 21:16:18 +0100 Subject: [PATCH 05/21] Fixes --- include/bit_vector_iterator.hpp | 10 +++++----- include/builder/parse_file.hpp | 11 +++++------ include/dictionary.impl | 4 ++-- include/kmer.hpp | 6 ++++-- include/util.hpp | 4 ++-- src/check_utils.hpp | 16 ++++++++-------- 6 files changed, 26 insertions(+), 25 deletions(-) diff --git a/include/bit_vector_iterator.hpp b/include/bit_vector_iterator.hpp index 104356f..77accd6 100644 --- a/include/bit_vector_iterator.hpp +++ b/include/bit_vector_iterator.hpp @@ -55,15 +55,15 @@ struct bit_vector_iterator { kmer_t val = m_buf; val.take(l); m_buf.drop_char(); - m_avail -= 2; - m_pos += 2; + m_avail -= kmer_t::bits_per_char; + m_pos += kmer_t::bits_per_char; return val; } inline uint64_t get_next_char() { if (m_avail < kmer_t::bits_per_char) fill_buf(); - m_avail -= 2; - m_pos += 2; + m_avail -= kmer_t::bits_per_char; + m_pos += kmer_t::bits_per_char; return m_buf.pop_char(); } @@ -84,7 +84,7 @@ struct bit_vector_iterator { inline void fill_buf() { static_assert(kmer_t::uint_kmer_bits % 64 == 0); for (int i = kmer_t::uint_kmer_bits - 64; i >= 0; i -= 64) { - m_buf.append64(m_bv->get_word64(m_pos + i)); + if (m_pos + i < m_bv->size()) { m_buf.append64(m_bv->get_word64(m_pos + i)); } } m_avail = kmer_t::uint_kmer_bits; } diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index 02431a0..a5f45c4 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -35,7 +35,7 @@ void parse_file(std::istream& is, parse_data& data, typename compact_string_pool::builder builder(k); std::string sequence; - kmer_t prev_minimizer = constants::invalid_uint64; + uint64_t prev_minimizer = constants::invalid_uint64; uint64_t begin = 0; // begin of parsed super_kmer in sequence uint64_t end = 0; // end of parsed super_kmer in sequence @@ -44,8 +44,7 @@ void parse_file(std::istream& is, parse_data& data, bool glue = false; auto append_super_kmer = [&]() { - if (sequence.empty() or prev_minimizer == kmer_t(constants::invalid_uint64) or - begin == end) { + if (sequence.empty() or prev_minimizer == constants::invalid_uint64 or begin == end) { return; } @@ -167,15 +166,15 @@ void parse_file(std::istream& is, parse_data& data, char const* kmer = sequence.data() + end; assert(util::is_valid(kmer, k)); kmer_t uint_kmer = util::string_to_uint_kmer(kmer, k); - auto minimizer = util::compute_minimizer(uint_kmer, k, m, seed); + uint64_t minimizer = util::compute_minimizer(uint_kmer, k, m, seed); if (build_config.canonical_parsing) { kmer_t uint_kmer_rc = uint_kmer.reverse_complement(k); - auto minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed); + uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed); minimizer = std::min(minimizer, minimizer_rc); } - if (prev_minimizer == kmer_t(constants::invalid_uint64)) prev_minimizer = minimizer; + if (prev_minimizer == constants::invalid_uint64) prev_minimizer = minimizer; if (minimizer != prev_minimizer) { append_super_kmer(); begin = end; diff --git a/include/dictionary.impl b/include/dictionary.impl index 8ad4bfc..1a362e0 100644 --- a/include/dictionary.impl +++ b/include/dictionary.impl @@ -119,7 +119,7 @@ template void dictionary::forward_neighbours(kmer_t suffix, neighbourhood& res) const { for (size_t i = 0; i < kmer_t::alphabet_size; i++) { kmer_t new_kmer = suffix; - new_kmer.add_kth_char(m_k - 1, kmer_t::alphabet[i]); + new_kmer.add_kth_char(m_k - 1, kmer_t::char_to_uint(kmer_t::alphabet[i])); res.forward[i] = lookup_advanced_uint(new_kmer); } } @@ -127,7 +127,7 @@ template void dictionary::backward_neighbours(kmer_t prefix, neighbourhood& res) const { for (size_t i = 0; i < kmer_t::alphabet_size; i++) { kmer_t new_kmer = prefix; - new_kmer.add_kth_char(0, kmer_t::alphabet[i]); + new_kmer.add_kth_char(0, kmer_t::char_to_uint(kmer_t::alphabet[i])); res.backward[i] = lookup_advanced_uint(new_kmer); } } diff --git a/include/kmer.hpp b/include/kmer.hpp index c4007fc..860aa7c 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -89,6 +89,8 @@ template struct dna_uint_kmer_t : alpha_kmer_t { using base = alpha_kmer_t; using base::uint_kmer_bits; + using base::bits_per_char; + using base::max_k; using base::base; /* This works with the map: @@ -117,7 +119,7 @@ struct dna_uint_kmer_t : alpha_kmer_t { dna_uint_kmer_t res(0); for (uint16_t i = 0; i < uint_kmer_bits; i += 64) { res.append64(crc64(x.pop64())); } // res is full reverse-complement to x - res.drop(uint_kmer_bits - k); + res.drop(uint_kmer_bits - k * bits_per_char); return res; } @@ -136,6 +138,6 @@ struct dna_uint_kmer_t : alpha_kmer_t { static uint64_t char_to_uint(char c) { return (c >> 1) & 3; } }; -using default_kmer_t = dna_uint_kmer_t<__uint128_t>; +using default_kmer_t = dna_uint_kmer_t; } // namespace sshash diff --git a/include/util.hpp b/include/util.hpp index 1e9fc39..6c9a008 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -209,7 +209,7 @@ static inline bool is_valid(int c) { return canonicalize_basepair_forward_map[c] } template -kmer_t compute_minimizer(kmer_t kmer, uint64_t k, uint64_t m, uint64_t seed) { +uint64_t compute_minimizer(kmer_t kmer, uint64_t k, uint64_t m, uint64_t seed) { assert(m <= constants::max_m); assert(m <= k); uint64_t min_hash = uint64_t(-1); @@ -224,7 +224,7 @@ kmer_t compute_minimizer(kmer_t kmer, uint64_t k, uint64_t m, uint64_t seed) { } kmer.drop_char(); } - return minimizer; + return uint64_t(minimizer); } /* used in dump.cpp */ diff --git a/src/check_utils.hpp b/src/check_utils.hpp index 807369f..5c3b28b 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -196,16 +196,16 @@ bool check_correctness_navigational_kmer_query(std::istream& is, dictionary curr = dict.kmer_neighbours(line.data() + i); - - char next_nuc = line[i + k]; - bool next_nuc_not_found = - curr.forward[kmer_t::char_to_uint(next_nuc)].kmer_id == constants::invalid_uint64; - if (next_nuc_not_found) { - std::cout << "expected forward[" << next_nuc << "]" << std::endl; + if (i + k < line.size()) { + char next_nuc = line[i + k]; + bool next_nuc_not_found = curr.forward[kmer_t::char_to_uint(next_nuc)].kmer_id == + constants::invalid_uint64; + if (next_nuc_not_found) { + std::cout << "expected forward[" << next_nuc << "]" << std::endl; + } + assert(!next_nuc_not_found); } - assert(!next_nuc_not_found); if (i != 0) { char prev_nuc = line[i - 1]; From d7c023ea27f6da87ab5745f1ebacb8a5edd42f92 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Tue, 16 Jan 2024 21:37:19 +0100 Subject: [PATCH 06/21] Newlines --- include/dictionary.hpp | 2 +- include/dictionary.impl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/dictionary.hpp b/include/dictionary.hpp index e1d44ae..00058ac 100644 --- a/include/dictionary.hpp +++ b/include/dictionary.hpp @@ -138,4 +138,4 @@ struct dictionary { #include "dictionary.impl" #include "dump.impl" #include "info.impl" -#include "statistics.impl" \ No newline at end of file +#include "statistics.impl" diff --git a/include/dictionary.impl b/include/dictionary.impl index 1a362e0..762f81b 100644 --- a/include/dictionary.impl +++ b/include/dictionary.impl @@ -204,4 +204,4 @@ uint64_t dictionary::num_bits() const { m_weights.num_bits(); } -} // namespace sshash \ No newline at end of file +} // namespace sshash From ff5a8483c36b7fa7ced0deb57dbbc3d28d1b0394 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 7 Feb 2024 22:31:02 +0100 Subject: [PATCH 07/21] slight refactor, implement bitpack.hpp --- include/bitpack.hpp | 70 +++++++++++++++++++ include/buckets.hpp | 2 +- include/dictionary.impl | 4 +- include/kmer.hpp | 70 ++++++++++--------- .../streaming_query_canonical_parsing.hpp | 2 +- .../query/streaming_query_regular_parsing.hpp | 2 +- 6 files changed, 112 insertions(+), 38 deletions(-) create mode 100644 include/bitpack.hpp diff --git a/include/bitpack.hpp b/include/bitpack.hpp new file mode 100644 index 0000000..b61900c --- /dev/null +++ b/include/bitpack.hpp @@ -0,0 +1,70 @@ +#pragma once + +namespace sshash { +// full binary tree of given height +// with Int type in its leafs +template +struct bitpack { + static_assert(height > 0); + using halfpack = std::conditional_t>; + static constexpr uint16_t hsize = 8 * sizeof(halfpack); + halfpack a, b; + + bitpack() {} + bitpack(uint64_t x) : a(x), b(0) {} + bitpack(halfpack a, halfpack b) : a(a), b(b) {} + explicit operator uint64_t() const { return (uint64_t)a; } + + bool operator==(bitpack const& t) const { return std::pair{a, b} == std::pair{t.a, t.b}; } + bool operator!=(bitpack const& t) const { return std::pair{a, b} != std::pair{t.a, t.b}; } + bool operator<(bitpack const& t) const { return std::pair{a, b} < std::pair{t.a, t.b}; } + + // shift in [0, size) + bitpack& operator>>=(uint16_t shift) { + if (shift < hsize) { + a = (a >> shift) | (b << (hsize - shift)); + b >>= shift; + } else { + a = b >> (shift - hsize); + b = 0; + } + return *this; + } + bitpack& operator<<=(uint16_t shift) { + if (shift < hsize) { + b = (b << shift) | (a >> (hsize - shift)); + a <<= shift; + } else { + b = a << (shift - hsize); + a = 0; + } + return *this; + } + bitpack operator<<(uint16_t shift) const { return bitpack(*this) <<= shift; } + bitpack operator>>(uint16_t shift) const { return bitpack(*this) >>= shift; } + + bitpack& operator|=(bitpack const& t) { + a |= t.a; + b |= t.b; + return *this; + } + bitpack& operator&=(bitpack const& t) { + a &= t.a; + b &= t.b; + return *this; + } + bitpack operator|(bitpack const& t) const { return bitpack(*this) |= t; } + bitpack operator&(bitpack const& t) const { return bitpack(*this) &= t; } + + static bitpack pref(uint16_t b) { + uint16_t h = std::min(b, hsize); + if constexpr (height > 1) { + return {halfpack::pref(h), halfpack::pref(b - h)}; + } else if (b < hsize) { + return {(Int(1) << b) - 1, Int(0)}; + } else { + return {Int(-1), (Int(1) << (b - hsize)) - 1}; + } + } +}; +} // namespace sshash diff --git a/include/buckets.hpp b/include/buckets.hpp index 2584af8..6b38d8a 100644 --- a/include/buckets.hpp +++ b/include/buckets.hpp @@ -205,7 +205,7 @@ struct buckets { clear = false; read_kmer.drop_char(); last_char = bv_it.get_next_char(); - read_kmer.add_kth_char(m_k - 1, last_char); + read_kmer.kth_char_or(m_k - 1, last_char); ++m_kmer_id; ++offset; return ret; diff --git a/include/dictionary.impl b/include/dictionary.impl index 762f81b..e704779 100644 --- a/include/dictionary.impl +++ b/include/dictionary.impl @@ -119,7 +119,7 @@ template void dictionary::forward_neighbours(kmer_t suffix, neighbourhood& res) const { for (size_t i = 0; i < kmer_t::alphabet_size; i++) { kmer_t new_kmer = suffix; - new_kmer.add_kth_char(m_k - 1, kmer_t::char_to_uint(kmer_t::alphabet[i])); + new_kmer.kth_char_or(m_k - 1, kmer_t::char_to_uint(kmer_t::alphabet[i])); res.forward[i] = lookup_advanced_uint(new_kmer); } } @@ -127,7 +127,7 @@ template void dictionary::backward_neighbours(kmer_t prefix, neighbourhood& res) const { for (size_t i = 0; i < kmer_t::alphabet_size; i++) { kmer_t new_kmer = prefix; - new_kmer.add_kth_char(0, kmer_t::char_to_uint(kmer_t::alphabet[i])); + new_kmer.kth_char_or(0, kmer_t::char_to_uint(kmer_t::alphabet[i])); res.backward[i] = lookup_advanced_uint(new_kmer); } } diff --git a/include/kmer.hpp b/include/kmer.hpp index 860aa7c..cad250a 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -1,21 +1,28 @@ #pragma once -namespace sshash { +#include "bitpack.hpp" -template +namespace sshash { +namespace impl { +template +auto pref(uint16_t b) -> std::enable_if_t, Kmer> { + return (Kmer(1) << b) - 1; +} +template +auto pref(uint16_t b) -> std::enable_if_t, Kmer> { + return Kmer::pref(b); +} +} // namespace impl + +template struct uint_kmer_t { - using uint_t = Int; - Int kmer; + using uint_t = Kmer; + Kmer kmer; uint_kmer_t() {} - uint_kmer_t(Int kmer) : kmer(kmer) {} + uint_kmer_t(uint64_t kmer) : kmer(kmer) {} - // Only if you're sure the result is uint64_t - // e.g. with minimizers - explicit operator uint64_t() const { - assert(uint64_t(kmer) == kmer); - return kmer; - } + explicit operator uint64_t() const { return kmer; } // TODO: change to <=> when switching to C++20 bool operator==(uint_kmer_t const& t) const { return kmer == t.kmer; } @@ -23,43 +30,39 @@ struct uint_kmer_t { bool operator<(uint_kmer_t const& t) const { return kmer < t.kmer; } void pad(uint16_t b) { kmer <<= b; } - void pad_char() { kmer <<= bits_per_char; } + void pad_char() { pad(bits_per_char); } void drop(uint16_t b) { kmer >>= b; } void drop64() { drop(64); } void drop_char() { drop(bits_per_char); } void drop_chars(uint16_t k) { drop(k * bits_per_char); } - void take(uint16_t b) { kmer &= (Int(1) << b) - 1; } - void take64() { kmer = uint64_t(kmer); } + void take(uint16_t b) { kmer &= impl::pref(b); } + void take64() { kmer = static_cast(kmer); } void take_char() { take(bits_per_char); } void take_chars(uint16_t k) { take(k * bits_per_char); } uint64_t pop64() { - uint64_t res(kmer); + uint64_t res = kmer; drop64(); return res; } uint64_t pop_char() { - uint64_t res(kmer); + uint64_t res = kmer; res &= (uint64_t(1) << bits_per_char) - 1; drop_char(); return res; } - void append_bits(uint64_t n, uint16_t b) { - pad(b); - kmer |= n; - } - void append64(uint64_t n) { append_bits(n, 64); } - void append_char(uint64_t c) { append_bits(c, bits_per_char); } + void append(uint16_t b, uint64_t n) { kmer = (kmer << b) | Kmer(n); } + void append64(uint64_t n) { append(64, n); } + void append_char(uint64_t c) { append(bits_per_char, c); } // assigns a character at k-th position // assuming that the position is empty - // use append_char instead, if possible - void add_kth_char(uint16_t k, uint64_t c) { kmer |= Int(c) << (k * bits_per_char); } + void kth_char_or(uint16_t k, uint64_t c) { kmer |= Kmer(c) << (k * bits_per_char); } - static constexpr uint16_t uint_kmer_bits = sizeof(Int) * 8; + static constexpr uint16_t uint_kmer_bits = 8 * sizeof(Kmer); static constexpr uint8_t bits_per_char = BitsPerChar; // max *odd* size that can be packed into uint_kmer_bits bits static constexpr uint16_t max_k = []() { @@ -67,13 +70,13 @@ struct uint_kmer_t { return max_k_any % 2 == 0 ? max_k_any - 1 : max_k_any; }(); - static_assert(uint_kmer_bits % 64 == 0, "Int must use 64*k bits"); - static_assert(bits_per_char <= 64, "At most 64 bits per character"); + static_assert(uint_kmer_bits % 64 == 0, "Kmer must use 64*k bits"); + static_assert(bits_per_char < 64, "Less than 64 bits per character"); }; -template -struct alpha_kmer_t : uint_kmer_t { - using base = uint_kmer_t; +template +struct alpha_kmer_t : uint_kmer_t { + using base = uint_kmer_t; using base::base; static constexpr char const* alphabet = Alphabet; static constexpr uint8_t alphabet_size = strlen(Alphabet); @@ -85,9 +88,9 @@ struct alpha_kmer_t : uint_kmer_t { constexpr char dna_alphabet[] = "ACTG"; -template -struct dna_uint_kmer_t : alpha_kmer_t { - using base = alpha_kmer_t; +template +struct dna_uint_kmer_t : alpha_kmer_t { + using base = alpha_kmer_t; using base::uint_kmer_bits; using base::bits_per_char; using base::max_k; @@ -138,6 +141,7 @@ struct dna_uint_kmer_t : alpha_kmer_t { static uint64_t char_to_uint(char c) { return (c >> 1) & 3; } }; +// also consider __uint128_t, bitpack<__uint128_t, 1> using default_kmer_t = dna_uint_kmer_t; } // namespace sshash diff --git a/include/query/streaming_query_canonical_parsing.hpp b/include/query/streaming_query_canonical_parsing.hpp index 7897a40..e812e69 100644 --- a/include/query/streaming_query_canonical_parsing.hpp +++ b/include/query/streaming_query_canonical_parsing.hpp @@ -55,7 +55,7 @@ struct streaming_query_canonical_parsing { /* 2. compute kmer and minimizer */ if (!m_start) { m_kmer.drop_char(); - m_kmer.add_kth_char(m_shift, kmer_t::char_to_uint(kmer[m_k - 1])); + m_kmer.kth_char_or(m_shift, kmer_t::char_to_uint(kmer[m_k - 1])); assert(m_kmer == util::string_to_uint_kmer(kmer, m_k)); } else { m_kmer = util::string_to_uint_kmer(kmer, m_k); diff --git a/include/query/streaming_query_regular_parsing.hpp b/include/query/streaming_query_regular_parsing.hpp index 42527a9..360468b 100644 --- a/include/query/streaming_query_regular_parsing.hpp +++ b/include/query/streaming_query_regular_parsing.hpp @@ -59,7 +59,7 @@ struct streaming_query_regular_parsing { /* 2. compute kmer and minimizer */ if (!m_start) { m_kmer.drop_char(); - m_kmer.add_kth_char(m_shift, kmer_t::char_to_uint(kmer[m_k - 1])); + m_kmer.kth_char_or(m_shift, kmer_t::char_to_uint(kmer[m_k - 1])); assert(m_kmer == util::string_to_uint_kmer(kmer, m_k)); } else { m_kmer = util::string_to_uint_kmer(kmer, m_k); From a395b944afb5501e00f4aeaad5244cc6051803ef Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 7 Feb 2024 22:35:10 +0100 Subject: [PATCH 08/21] static_cast in operator uint64_t --- include/kmer.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/kmer.hpp b/include/kmer.hpp index cad250a..8f5d4f6 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -22,7 +22,7 @@ struct uint_kmer_t { uint_kmer_t() {} uint_kmer_t(uint64_t kmer) : kmer(kmer) {} - explicit operator uint64_t() const { return kmer; } + explicit operator uint64_t() const { return static_cast(kmer); } // TODO: change to <=> when switching to C++20 bool operator==(uint_kmer_t const& t) const { return kmer == t.kmer; } @@ -142,6 +142,6 @@ struct dna_uint_kmer_t : alpha_kmer_t { }; // also consider __uint128_t, bitpack<__uint128_t, 1> -using default_kmer_t = dna_uint_kmer_t; +using default_kmer_t = dna_uint_kmer_t>; } // namespace sshash From 27c4c31e9cb521f4b857ddc1b8cea7f420f14955 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Thu, 8 Feb 2024 14:15:39 +0100 Subject: [PATCH 09/21] refactor, use ~(~0< 1) { - return {halfpack::pref(h), halfpack::pref(b - h)}; - } else if (b < hsize) { - return {(Int(1) << b) - 1, Int(0)}; - } else { - return {Int(-1), (Int(1) << (b - hsize)) - 1}; - } - } + bitpack operator~() const { return {~a, ~b}; } }; } // namespace sshash diff --git a/include/kmer.hpp b/include/kmer.hpp index 8f5d4f6..9de8879 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -1,19 +1,14 @@ #pragma once #include "bitpack.hpp" +#include -namespace sshash { -namespace impl { -template -auto pref(uint16_t b) -> std::enable_if_t, Kmer> { - return (Kmer(1) << b) - 1; -} -template -auto pref(uint16_t b) -> std::enable_if_t, Kmer> { - return Kmer::pref(b); +template +bool operator<(std::bitset const& a, std::bitset const& b) { + return a.to_string() < b.to_string(); } -} // namespace impl +namespace sshash { template struct uint_kmer_t { using uint_t = Kmer; @@ -22,7 +17,13 @@ struct uint_kmer_t { uint_kmer_t() {} uint_kmer_t(uint64_t kmer) : kmer(kmer) {} - explicit operator uint64_t() const { return static_cast(kmer); } + explicit operator uint64_t() const { + if constexpr (std::is_constructible_v) { + return static_cast(kmer); + } else { // std::bitset? + return (kmer & Kmer(uint64_t(-1))).to_ulong(); + } + } // TODO: change to <=> when switching to C++20 bool operator==(uint_kmer_t const& t) const { return kmer == t.kmer; } @@ -37,18 +38,18 @@ struct uint_kmer_t { void drop_char() { drop(bits_per_char); } void drop_chars(uint16_t k) { drop(k * bits_per_char); } - void take(uint16_t b) { kmer &= impl::pref(b); } - void take64() { kmer = static_cast(kmer); } + void take(uint16_t b) { kmer &= ~(~Kmer(0) << b); } + void take64() { kmer &= Kmer(uint64_t(-1)); } void take_char() { take(bits_per_char); } void take_chars(uint16_t k) { take(k * bits_per_char); } uint64_t pop64() { - uint64_t res = kmer; + uint64_t res = *this; drop64(); return res; } uint64_t pop_char() { - uint64_t res = kmer; + uint64_t res = *this; res &= (uint64_t(1) << bits_per_char) - 1; drop_char(); return res; @@ -141,7 +142,7 @@ struct dna_uint_kmer_t : alpha_kmer_t { static uint64_t char_to_uint(char c) { return (c >> 1) & 3; } }; -// also consider __uint128_t, bitpack<__uint128_t, 1> -using default_kmer_t = dna_uint_kmer_t>; +// also supports __uint128_t, bitpack<__uint128_t, 1>, std::bitset<256>, etc +using default_kmer_t = dna_uint_kmer_t; } // namespace sshash From f71fd794c50262c6ac9af552e42a7c0c175d14b4 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 27 Mar 2024 14:26:07 +0100 Subject: [PATCH 10/21] fix compilation errors --- external/pthash | 2 +- include/buckets.hpp | 2 +- include/kmer.hpp | 6 +++--- src/check_utils.hpp | 3 ++- test/test_alphabet.cpp | 44 +++++++++++++++++++++--------------------- 5 files changed, 29 insertions(+), 28 deletions(-) diff --git a/external/pthash b/external/pthash index a4ab139..28aedcb 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit a4ab139b9afd8e975b003d4ac5f3270de8ca378e +Subproject commit 28aedcb03af096c0e9988b1ad240da7b4cf010d7 diff --git a/include/buckets.hpp b/include/buckets.hpp index 2de91e1..44d92bd 100644 --- a/include/buckets.hpp +++ b/include/buckets.hpp @@ -215,7 +215,7 @@ struct buckets { m_clear = false; m_read_kmer.drop_char(); m_last_char = m_bv_it.get_next_char(); - read_kmer.kth_char_or(m_k - 1, m_last_char); + m_read_kmer.kth_char_or(m_k - 1, m_last_char); ++m_begin_kmer_id; ++m_offset; return m_ret; diff --git a/include/kmer.hpp b/include/kmer.hpp index 4ee8e49..c18e2f7 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -93,8 +93,8 @@ constexpr char nucleotides[] = "ACTG"; #endif template -struct dna_uint_kmer_t : alpha_kmer_t { - using base = alpha_kmer_t; +struct dna_uint_kmer_t : alpha_kmer_t { + using base = alpha_kmer_t; using base::uint_kmer_bits; using base::bits_per_char; using base::max_k; @@ -149,7 +149,7 @@ struct dna_uint_kmer_t : alpha_kmer_t { g 103 01100111 -> 10 t 116 01110100 -> 11 */ - static inline uint64_t char_to_uint(char c) { return (((c >> 1) ^ (c >> 2)) & 3); } + static uint64_t char_to_uint(char c) { return (((c >> 1) ^ (c >> 2)) & 3); } #else /* char decimal binary diff --git a/src/check_utils.hpp b/src/check_utils.hpp index fc8a1b4..e0ff20e 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -434,7 +434,8 @@ bool check_correctness_kmer_iterator(dictionary const& dict) { return true; } -bool check_correctness_contig_iterator(dictionary const& dict) { +template +bool check_correctness_contig_iterator(dictionary const& dict) { std::cout << "checking correctness of contig iterator..." << std::endl; std::string expected_kmer(dict.k(), 0); for (uint64_t contig_id = 0; contig_id != dict.num_contigs(); ++contig_id) { diff --git a/test/test_alphabet.cpp b/test/test_alphabet.cpp index e11d288..c4b0882 100644 --- a/test/test_alphabet.cpp +++ b/test/test_alphabet.cpp @@ -25,8 +25,8 @@ int main(int argc, char** argv) { return 1; } uint64_t k = std::stoull(argv[1]); - if (k > constants::max_k) { - std::cerr << "k must be less <= " << constants::max_k << " but got k = " << k << '\n'; + if (k > kmer_t::max_k) { + std::cerr << "k must be less <= " << kmer_t::max_k << " but got k = " << k << '\n'; return 1; } std::string read( @@ -54,23 +54,23 @@ int main(int argc, char** argv) { /****/ #ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING - expect(kmer_t::char_to_int('A'), kmer_t(0)); - expect(kmer_t::char_to_int('a'), kmer_t(0)); - expect(kmer_t::char_to_int('C'), kmer_t(1)); - expect(kmer_t::char_to_int('c'), kmer_t(1)); - expect(kmer_t::char_to_int('G'), kmer_t(2)); - expect(kmer_t::char_to_int('g'), kmer_t(2)); - expect(kmer_t::char_to_int('T'), kmer_t(3)); - expect(kmer_t::char_to_int('t'), kmer_t(3)); + expect(kmer_t::char_to_uint('A'), 0); + expect(kmer_t::char_to_uint('a'), 0); + expect(kmer_t::char_to_uint('C'), 1); + expect(kmer_t::char_to_uint('c'), 1); + expect(kmer_t::char_to_uint('G'), 2); + expect(kmer_t::char_to_uint('g'), 2); + expect(kmer_t::char_to_uint('T'), 3); + expect(kmer_t::char_to_uint('t'), 3); #else - expect(kmer_t::char_to_int('A'), kmer_t(0)); - expect(kmer_t::char_to_int('a'), kmer_t(0)); - expect(kmer_t::char_to_int('C'), kmer_t(1)); - expect(kmer_t::char_to_int('c'), kmer_t(1)); - expect(kmer_t::char_to_int('T'), kmer_t(2)); - expect(kmer_t::char_to_int('t'), kmer_t(2)); - expect(kmer_t::char_to_int('G'), kmer_t(3)); - expect(kmer_t::char_to_int('g'), kmer_t(3)); + expect(kmer_t::char_to_uint('A'), 0); + expect(kmer_t::char_to_uint('a'), 0); + expect(kmer_t::char_to_uint('C'), 1); + expect(kmer_t::char_to_uint('c'), 1); + expect(kmer_t::char_to_uint('T'), 2); + expect(kmer_t::char_to_uint('t'), 2); + expect(kmer_t::char_to_uint('G'), 3); + expect(kmer_t::char_to_uint('g'), 3); #endif for (uint64_t kmer_len = 1; kmer_len <= k; ++kmer_len) { @@ -80,10 +80,10 @@ int main(int argc, char** argv) { for (uint64_t i = 0; i != 1000; ++i) { // generate a random kmer of length kmer_len random_kmer(kmer.data(), kmer_len); - kmer_t::compute_reverse_complement(kmer.data(), rc.data(), kmer_len); - kmer_t uint_kmer = kmer_t::string_to_uint_kmer(kmer.data(), kmer_len); - uint_kmer = kmer_t::compute_reverse_complement(uint_kmer, kmer_len); - expect(kmer_t::uint_kmer_to_string(uint_kmer, kmer_len), rc); + util::compute_reverse_complement(kmer.data(), rc.data(), kmer_len); + kmer_t uint_kmer = util::string_to_uint_kmer(kmer.data(), kmer_len); + uint_kmer = uint_kmer.reverse_complement(kmer_len); + expect(util::uint_kmer_to_string(uint_kmer, kmer_len), rc); } } From be0a9552bf9d86f1f33e575039b062edfb6d0240 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 27 Mar 2024 14:49:21 +0100 Subject: [PATCH 11/21] use kmer_t::bits_per_char instead of const 2 --- include/buckets.hpp | 20 ++++++++++---------- include/builder/build_skew_index.hpp | 6 +++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/include/buckets.hpp b/include/buckets.hpp index 44d92bd..a4e4768 100644 --- a/include/buckets.hpp +++ b/include/buckets.hpp @@ -57,14 +57,14 @@ struct buckets { kmer_t contig_prefix(uint64_t contig_id, uint64_t k) const { uint64_t contig_begin = pieces.access(contig_id); - bit_vector_iterator bv_it(strings, 2 * contig_begin); - return bv_it.read(2 * (k - 1)); + bit_vector_iterator bv_it(strings, kmer_t::bits_per_char * contig_begin); + return bv_it.read(kmer_t::bits_per_char * (k - 1)); } kmer_t contig_suffix(uint64_t contig_id, uint64_t k) const { uint64_t contig_end = pieces.access(contig_id + 1); - bit_vector_iterator bv_it(strings, 2 * (contig_end - k + 1)); - return bv_it.read(2 * (k - 1)); + bit_vector_iterator bv_it(strings, kmer_t::bits_per_char * (contig_end - k + 1)); + return bv_it.read(kmer_t::bits_per_char * (k - 1)); } std::pair locate_bucket(uint64_t bucket_id) const { @@ -95,7 +95,7 @@ struct buckets { uint64_t m) const { uint64_t offset = offsets.access(super_kmer_id); auto [res, contig_end] = offset_to_id(offset, k); - bit_vector_iterator bv_it(strings, 2 * offset); + bit_vector_iterator bv_it(strings, kmer_t::bits_per_char * offset); uint64_t window_size = std::min(k - m + 1, contig_end - offset - k + 1); for (uint64_t w = 0; w != window_size; ++w) { kmer_t read_kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * k); @@ -120,7 +120,7 @@ struct buckets { for (uint64_t super_kmer_id = begin; super_kmer_id != end; ++super_kmer_id) { uint64_t offset = offsets.access(super_kmer_id); auto [res, contig_end] = offset_to_id(offset, k); - bit_vector_iterator bv_it(strings, 2 * offset); + bit_vector_iterator bv_it(strings, kmer_t::bits_per_char * offset); uint64_t window_size = std::min(k - m + 1, contig_end - offset - k + 1); for (uint64_t w = 0; w != window_size; ++w) { kmer_t read_kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * k); @@ -171,8 +171,8 @@ struct buckets { void access(uint64_t kmer_id, char* string_kmer, uint64_t k) const { uint64_t offset = id_to_offset(kmer_id, k); - bit_vector_iterator bv_it(strings, 2 * offset); - kmer_t read_kmer = bv_it.read(2 * k); + bit_vector_iterator bv_it(strings, kmer_t::bits_per_char * offset); + kmer_t read_kmer = bv_it.read(kmer_t::bits_per_char * k); util::uint_kmer_to_string(read_kmer, string_kmer, k); } @@ -239,10 +239,10 @@ struct buckets { bool m_clear; void next_piece() { - m_bv_it.at(2 * m_offset); + m_bv_it.at(kmer_t::bits_per_char * m_offset); m_next_offset = m_pieces_it.next(); assert(m_next_offset > m_offset); - m_read_kmer = m_bv_it.take(2 * m_k); + m_read_kmer = m_bv_it.take(kmer_t::bits_per_char * m_k); m_clear = true; } }; diff --git a/include/builder/build_skew_index.hpp b/include/builder/build_skew_index.hpp index 38753a2..f9e6600 100644 --- a/include/builder/build_skew_index.hpp +++ b/include/builder/build_skew_index.hpp @@ -216,12 +216,12 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data assert(lists[i].size() > lower and lists[i].size() <= upper); uint64_t super_kmer_id = 0; for (auto [offset, num_kmers_in_super_kmer] : lists[i]) { - bit_vector_iterator bv_it(m_buckets.strings, 2 * offset); + bit_vector_iterator bv_it(m_buckets.strings, kmer_t::bits_per_char * offset); for (uint64_t i = 0; i != num_kmers_in_super_kmer; ++i) { - kmer_t kmer = bv_it.read(2 * build_config.k); + kmer_t kmer = bv_it.read(kmer_t::bits_per_char * build_config.k); keys_in_partition.push_back(kmer); super_kmer_ids_in_partition.push_back(super_kmer_id); - bv_it.eat(2); + bv_it.eat(kmer_t::bits_per_char); } assert(super_kmer_id < (1ULL << cvb_positions.width())); ++super_kmer_id; From 04aef53aecb5828addff513adb888acbb21369df Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 27 Mar 2024 22:46:14 +0100 Subject: [PATCH 12/21] fix clang compile errors + clang-format --- include/builder/build_skew_index.hpp | 3 ++- include/kmer.hpp | 7 ++++--- src/check_utils.hpp | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/include/builder/build_skew_index.hpp b/include/builder/build_skew_index.hpp index f9e6600..0faa749 100644 --- a/include/builder/build_skew_index.hpp +++ b/include/builder/build_skew_index.hpp @@ -216,7 +216,8 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data assert(lists[i].size() > lower and lists[i].size() <= upper); uint64_t super_kmer_id = 0; for (auto [offset, num_kmers_in_super_kmer] : lists[i]) { - bit_vector_iterator bv_it(m_buckets.strings, kmer_t::bits_per_char * offset); + bit_vector_iterator bv_it(m_buckets.strings, + kmer_t::bits_per_char * offset); for (uint64_t i = 0; i != num_kmers_in_super_kmer; ++i) { kmer_t kmer = bv_it.read(kmer_t::bits_per_char * build_config.k); keys_in_partition.push_back(kmer); diff --git a/include/kmer.hpp b/include/kmer.hpp index c18e2f7..c81062f 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -2,6 +2,7 @@ #include "bitpack.hpp" #include +#include template bool operator<(std::bitset const& a, std::bitset const& b) { @@ -44,12 +45,12 @@ struct uint_kmer_t { void take_chars(uint16_t k) { take(k * bits_per_char); } uint64_t pop64() { - uint64_t res = *this; + uint64_t res(*this); drop64(); return res; } uint64_t pop_char() { - uint64_t res = *this; + uint64_t res(*this); res &= (uint64_t(1) << bits_per_char) - 1; drop_char(); return res; @@ -80,7 +81,7 @@ struct alpha_kmer_t : uint_kmer_t { using base = uint_kmer_t; using base::base; static constexpr char const* alphabet = Alphabet; - static constexpr uint8_t alphabet_size = strlen(Alphabet); + static constexpr uint8_t alphabet_size = std::char_traits::length(Alphabet); static uint64_t char_to_uint(char c); static char uint64_to_char(uint64_t x) { return alphabet[x]; } diff --git a/src/check_utils.hpp b/src/check_utils.hpp index e0ff20e..a4d9d10 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -434,7 +434,7 @@ bool check_correctness_kmer_iterator(dictionary const& dict) { return true; } -template +template bool check_correctness_contig_iterator(dictionary const& dict) { std::cout << "checking correctness of contig iterator..." << std::endl; std::string expected_kmer(dict.k(), 0); From d44152e0b9ac8f3d85c670d4e1f12b6b1a764a30 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 27 Mar 2024 23:44:04 +0100 Subject: [PATCH 13/21] include for struct neighbourhood --- include/util.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/util.hpp b/include/util.hpp index e3c49ad..18e5a57 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include #include // for std::ceil on linux @@ -34,7 +34,6 @@ struct lookup_result { template struct neighbourhood { - /* forward */ std::array forward; std::array backward; }; From 07424de9200ef3b2eceb41d56c94329f20d8a342 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Wed, 17 Apr 2024 01:28:14 +0200 Subject: [PATCH 14/21] fix UB due to shift by 64 --- include/kmer.hpp | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/include/kmer.hpp b/include/kmer.hpp index c81062f..9567ff5 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -31,11 +31,23 @@ struct uint_kmer_t { bool operator!=(uint_kmer_t const& t) const { return kmer != t.kmer; } bool operator<(uint_kmer_t const& t) const { return kmer < t.kmer; } - void pad(uint16_t b) { kmer <<= b; } + void pad(uint16_t b) { + assert(b < uint_kmer_bits); + kmer <<= b; + } void pad_char() { pad(bits_per_char); } - void drop(uint16_t b) { kmer >>= b; } - void drop64() { drop(64); } + void drop(uint16_t b) { + assert(b < uint_kmer_bits); + kmer >>= b; + } + void drop64() { + if constexpr (uint_kmer_bits == 64) { + kmer = 0; + } else { + drop(64); + } + } void drop_char() { drop(bits_per_char); } void drop_chars(uint16_t k) { drop(k * bits_per_char); } @@ -56,8 +68,17 @@ struct uint_kmer_t { return res; } - void append(uint16_t b, uint64_t n) { kmer = (kmer << b) | Kmer(n); } - void append64(uint64_t n) { append(64, n); } + void append(uint16_t b, uint64_t n) { + assert(b < uint_kmer_bits); + kmer = (kmer << b) | Kmer(n); + } + void append64(uint64_t n) { + if constexpr (uint_kmer_bits == 64) { + kmer = n; + } else { + append(64, n); + } + } void append_char(uint64_t c) { append(bits_per_char, c); } // assigns a character at k-th position @@ -123,7 +144,6 @@ struct dna_uint_kmer_t : alpha_kmer_t { const uint64_t c2 = 0x3333333333333333; res = ((res & c1) << 4) | ((res & (c1 << 4)) >> 4); // swap 2-nuc order in bytes res = ((res & c2) << 2) | ((res & (c2 << 2)) >> 2); // swap nuc order in 2-nuc - return res; } From 33a30b6b4f9c4fa18b812ab773b05f0e2fa013aa Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Mon, 27 May 2024 10:51:58 +0200 Subject: [PATCH 15/21] fix warning when printing uint128_t --- test/test_alphabet.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_alphabet.cpp b/test/test_alphabet.cpp index c4b0882..de76b96 100644 --- a/test/test_alphabet.cpp +++ b/test/test_alphabet.cpp @@ -5,8 +5,8 @@ using namespace sshash; std::ostream& operator<<(std::ostream& os, __uint128_t x) { - os << *(reinterpret_cast(&x) + 0); - os << *(reinterpret_cast(&x) + 1); + const uint64_t *xp = reinterpret_cast(&x); + os << xp[0] << xp[1]; return os; } @@ -116,4 +116,4 @@ int main(int argc, char** argv) { expect(util::canonicalize_basepair_reverse_map[int('g')], 'c'); return 0; -} \ No newline at end of file +} From 5cd0d65765b82686d9976e91cddef1b7f89855d9 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Mon, 27 May 2024 11:11:09 +0200 Subject: [PATCH 16/21] fix subobject-linkage warnings --- include/kmer.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/kmer.hpp b/include/kmer.hpp index 9567ff5..1200dc2 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -109,9 +109,9 @@ struct alpha_kmer_t : uint_kmer_t { }; #ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING -constexpr char nucleotides[] = "ACGT"; +inline constexpr char nucleotides[] = "ACGT"; #else -constexpr char nucleotides[] = "ACTG"; +inline constexpr char nucleotides[] = "ACTG"; #endif template From 889fe5c1cbef86f877d2413714ea5e899c26f328 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Mon, 27 May 2024 16:28:31 +0200 Subject: [PATCH 17/21] Update test_alphabet.cpp --- test/test_alphabet.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_alphabet.cpp b/test/test_alphabet.cpp index de76b96..701726b 100644 --- a/test/test_alphabet.cpp +++ b/test/test_alphabet.cpp @@ -5,8 +5,7 @@ using namespace sshash; std::ostream& operator<<(std::ostream& os, __uint128_t x) { - const uint64_t *xp = reinterpret_cast(&x); - os << xp[0] << xp[1]; + os << static_cast(x) << static_cast(x >> 64); return os; } From 2a663783e653011a7d4a785cabbc7dd98eb3cef2 Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Thu, 20 Jun 2024 14:08:57 +0200 Subject: [PATCH 18/21] Amino acid alphabet k-mer support (#4) --- include/builder/build.impl | 4 +- include/builder/parse_file.hpp | 7 +- include/constants.hpp | 4 - include/cover/parse_file.hpp | 10 +- include/dictionary.impl | 6 +- include/dump.impl | 3 +- include/kmer.hpp | 119 ++++++++++++++++-- .../streaming_query_canonical_parsing.hpp | 9 +- .../query/streaming_query_regular_parsing.hpp | 6 +- include/statistics.impl | 3 +- include/util.hpp | 64 +--------- src/bench_utils.hpp | 6 +- src/check_utils.hpp | 9 +- src/permute.cpp | 2 +- test/test_alphabet.cpp | 38 +++--- 15 files changed, 168 insertions(+), 122 deletions(-) diff --git a/include/builder/build.impl b/include/builder/build.impl index 07453eb..dab731c 100644 --- a/include/builder/build.impl +++ b/include/builder/build.impl @@ -22,8 +22,8 @@ void dictionary::build(std::string const& filename, " but got k = " + std::to_string(build_config.k)); } if (build_config.m == 0) throw std::runtime_error("m must be > 0"); - if (build_config.m > constants::max_m) { - throw std::runtime_error("m must be less <= " + std::to_string(constants::max_m) + + if (build_config.m > kmer_t::max_m) { + throw std::runtime_error("m must be less <= " + std::to_string(kmer_t::max_m) + " but got m = " + std::to_string(build_config.m)); } if (build_config.m > build_config.k) throw std::runtime_error("m must be < k"); diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp index 1fc31c4..9ba9b95 100644 --- a/include/builder/parse_file.hpp +++ b/include/builder/parse_file.hpp @@ -51,7 +51,7 @@ void parse_file(std::istream& is, parse_data& data, assert(end > begin); char const* super_kmer = sequence.data() + begin; uint64_t size = (end - begin) + k - 1; - assert(util::is_valid(super_kmer, size)); + assert(util::is_valid(super_kmer, size)); /* if num_kmers_in_super_kmer > k - m + 1, then split the super_kmer into blocks */ uint64_t num_kmers_in_super_kmer = end - begin; @@ -164,12 +164,13 @@ void parse_file(std::istream& is, parse_data& data, while (end != sequence.size() - k + 1) { char const* kmer = sequence.data() + end; - assert(util::is_valid(kmer, k)); + assert(util::is_valid(kmer, k)); kmer_t uint_kmer = util::string_to_uint_kmer(kmer, k); uint64_t minimizer = util::compute_minimizer(uint_kmer, k, m, seed); if (build_config.canonical_parsing) { - kmer_t uint_kmer_rc = uint_kmer.reverse_complement(k); + kmer_t uint_kmer_rc = uint_kmer; + uint_kmer_rc.reverse_complement_inplace(k); uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed); minimizer = std::min(minimizer, minimizer_rc); } diff --git a/include/constants.hpp b/include/constants.hpp index 3f04546..aa5ccf2 100644 --- a/include/constants.hpp +++ b/include/constants.hpp @@ -3,10 +3,6 @@ #include "kmer.hpp" namespace sshash::constants { - -/* max *odd* size that can be packed into 64 bits */ -constexpr uint64_t max_m = 31; - constexpr uint64_t invalid_uint64 = uint64_t(-1); constexpr uint64_t seed = 1; diff --git a/include/cover/parse_file.hpp b/include/cover/parse_file.hpp index 2b1c4c2..53581c7 100644 --- a/include/cover/parse_file.hpp +++ b/include/cover/parse_file.hpp @@ -198,6 +198,7 @@ void reverse_header(std::string const& input, std::string& output, uint64_t k) { for (auto weight : weights) output.append(std::to_string(weight) + " "); } +template void permute_and_write(std::istream& is, std::string const& output_filename, std::string const& tmp_dirname, pthash::compact_vector const& permutation, pthash::bit_vector const& signs, uint64_t k) { @@ -255,8 +256,8 @@ void permute_and_write(std::istream& is, std::string const& output_filename, and reverse the weights in header_sequence */ dna_sequence_rc.resize(dna_sequence.size()); header_sequence_r.clear(); - util::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(), - dna_sequence.size()); + kmer_t::compute_reverse_complement(dna_sequence.data(), dna_sequence_rc.data(), + dna_sequence.size()); reverse_header(header_sequence, header_sequence_r, k); dna_sequence.swap(dna_sequence_rc); header_sequence.swap(header_sequence_r); @@ -352,6 +353,7 @@ void permute_and_write(std::istream& is, std::string const& output_filename, } } +template void permute_and_write(std::string const& input_filename, std::string const& output_filename, std::string const& tmp_dirname, pthash::compact_vector const& permutation, pthash::bit_vector const& signs, uint64_t k) { @@ -360,9 +362,9 @@ void permute_and_write(std::string const& input_filename, std::string const& out std::cout << "reading file '" << input_filename << "'..." << std::endl; if (util::ends_with(input_filename, ".gz")) { zip_istream zis(is); - permute_and_write(zis, output_filename, tmp_dirname, permutation, signs, k); + permute_and_write(zis, output_filename, tmp_dirname, permutation, signs, k); } else { - permute_and_write(is, output_filename, tmp_dirname, permutation, signs, k); + permute_and_write(is, output_filename, tmp_dirname, permutation, signs, k); } is.close(); } diff --git a/include/dictionary.impl b/include/dictionary.impl index d00fa4a..72d41a9 100644 --- a/include/dictionary.impl +++ b/include/dictionary.impl @@ -24,7 +24,8 @@ lookup_result dictionary::lookup_uint_regular_parsing(kmer_t uint_kmer) template lookup_result dictionary::lookup_uint_canonical_parsing(kmer_t uint_kmer) const { - kmer_t uint_kmer_rc = uint_kmer.reverse_complement(m_k); + kmer_t uint_kmer_rc = uint_kmer; + uint_kmer_rc.reverse_complement_inplace(m_k); auto minimizer = util::compute_minimizer(uint_kmer, m_k, m_m, m_seed); auto minimizer_rc = util::compute_minimizer(uint_kmer_rc, m_k, m_m, m_seed); uint64_t bucket_id = m_minimizers.lookup(uint64_t(std::min(minimizer, minimizer_rc))); @@ -79,7 +80,8 @@ lookup_result dictionary::lookup_advanced_uint(kmer_t uint_kmer, auto res = lookup_uint_regular_parsing(uint_kmer); assert(res.kmer_orientation == constants::forward_orientation); if (check_reverse_complement and res.kmer_id == constants::invalid_uint64) { - kmer_t uint_kmer_rc = uint_kmer.reverse_complement(m_k); + kmer_t uint_kmer_rc = uint_kmer; + uint_kmer_rc.reverse_complement_inplace(m_k); res = lookup_uint_regular_parsing(uint_kmer_rc); res.kmer_orientation = constants::backward_orientation; } diff --git a/include/dump.impl b/include/dump.impl index a51973d..a2373fa 100644 --- a/include/dump.impl +++ b/include/dump.impl @@ -31,7 +31,8 @@ void dictionary::dump(std::string const& filename) const { kmer_t kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * m_k); auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed); if (m_canonical_parsing) { - kmer_t kmer_rc = kmer.reverse_complement(m_k); + kmer_t kmer_rc = kmer; + kmer_rc.reverse_complement_inplace(m_k); auto [minimizer_rc, pos_rc] = util::compute_minimizer_pos(kmer_rc, m_k, m_m, m_seed); if (minimizer_rc < minimizer) { diff --git a/include/kmer.hpp b/include/kmer.hpp index 1200dc2..69affbe 100644 --- a/include/kmer.hpp +++ b/include/kmer.hpp @@ -87,14 +87,12 @@ struct uint_kmer_t { static constexpr uint16_t uint_kmer_bits = 8 * sizeof(Kmer); static constexpr uint8_t bits_per_char = BitsPerChar; - // max *odd* size that can be packed into uint_kmer_bits bits - static constexpr uint16_t max_k = []() { - uint16_t max_k_any = uint_kmer_bits / bits_per_char; - return max_k_any % 2 == 0 ? max_k_any - 1 : max_k_any; - }(); static_assert(uint_kmer_bits % 64 == 0, "Kmer must use 64*k bits"); - static_assert(bits_per_char < 64, "Less than 64 bits per character"); + static_assert(bits_per_char < 64, "BitsPerChar must be less than 64"); + + static constexpr uint16_t max_k = uint_kmer_bits / bits_per_char; + static constexpr uint16_t max_m = 64 / bits_per_char; }; template @@ -104,8 +102,16 @@ struct alpha_kmer_t : uint_kmer_t { static constexpr char const* alphabet = Alphabet; static constexpr uint8_t alphabet_size = std::char_traits::length(Alphabet); + static bool is_valid(char c); static uint64_t char_to_uint(char c); static char uint64_to_char(uint64_t x) { return alphabet[x]; } + + // Revcompl only makes sense for DNA, fallback to noop otherwise + [[maybe_unused]] virtual void reverse_complement_inplace(uint64_t) {} + [[maybe_unused]] static void compute_reverse_complement(char const* input, char* output, + uint64_t size) { + for (uint64_t i = 0; i != size; ++i) { output[i] = input[i]; } + } }; #ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING @@ -119,8 +125,10 @@ struct dna_uint_kmer_t : alpha_kmer_t { using base = alpha_kmer_t; using base::uint_kmer_bits; using base::bits_per_char; - using base::max_k; using base::base; + // Use odd lengths for DNA to avoid dealing with self-complements + static constexpr uint16_t max_k = base::max_k - !(base::max_k % 2); + static constexpr uint16_t max_m = base::max_m - !(base::max_m % 2); /* This works with the map: A -> 00; C -> 01; G -> 11; T -> 10. @@ -147,14 +155,13 @@ struct dna_uint_kmer_t : alpha_kmer_t { return res; } - [[maybe_unused]] dna_uint_kmer_t reverse_complement(uint64_t k) { - dna_uint_kmer_t x(*this); + [[maybe_unused]] void reverse_complement_inplace(uint64_t k) override { assert(k <= max_k); - dna_uint_kmer_t res(0); - for (uint16_t i = 0; i < uint_kmer_bits; i += 64) { res.append64(crc64(x.pop64())); } + dna_uint_kmer_t rev(0); + for (uint16_t i = 0; i < uint_kmer_bits; i += 64) { rev.append64(crc64(base::pop64())); } // res is full reverse-complement to x - res.drop(uint_kmer_bits - k * bits_per_char); - return res; + rev.drop(uint_kmer_bits - k * bits_per_char); + *this = rev; } #ifdef SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING @@ -186,6 +193,92 @@ struct dna_uint_kmer_t : alpha_kmer_t { */ static uint64_t char_to_uint(char c) { return (c >> 1) & 3; } #endif + + /* + Forward character map: + A -> A 65 + C -> C 67 + G -> G 71 + T -> T 84 + a -> a 97 + c -> c 99 + g -> g 103 + t -> t 116 + All other chars map to zero. + */ + static constexpr char canonicalize_basepair_forward_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65, 0, 67, 0, 0, 0, 71, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 0, 99, 0, 0, 0, 103, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + /* + Reverse character map: + 65 A -> T 84 + 67 C -> G 71 + 71 G -> C 67 + 84 T -> A 65 + 97 a -> t 116 + 99 c -> g 103 + 103 g -> c 99 + 116 t -> a 97 + All other chars map to zero. + */ + static constexpr char canonicalize_basepair_reverse_map[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 0, 71, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 103, 0, 0, 0, 99, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + [[maybe_unused]] static void compute_reverse_complement(char const* input, char* output, + uint64_t size) { + for (uint64_t i = 0; i != size; ++i) { + int c = input[i]; + output[size - i - 1] = canonicalize_basepair_reverse_map[c]; + } + } + + static inline bool is_valid(char c) { + return canonicalize_basepair_forward_map[static_cast(c)]; + } +}; + +inline constexpr char amino_acids[] = "ABCDEFGHIJKLMNOPQRSTUVWYZX"; +template +struct aa_uint_kmer_t : alpha_kmer_t { + using base = alpha_kmer_t; + using base::uint_kmer_bits; + using base::bits_per_char; + using base::base; + + static constexpr int8_t char_to_aa[256] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, + 25, 23, 24, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 25, 23, 24, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; + + static bool is_valid(char c) { return ~char_to_aa[static_cast(c)]; } + static uint64_t char_to_uint(char c) { return char_to_aa[static_cast(c)]; } }; // also supports bitpack<__uint128_t, 1>, std::bitset<256>, etc diff --git a/include/query/streaming_query_canonical_parsing.hpp b/include/query/streaming_query_canonical_parsing.hpp index e812e69..a5756e6 100644 --- a/include/query/streaming_query_canonical_parsing.hpp +++ b/include/query/streaming_query_canonical_parsing.hpp @@ -46,7 +46,8 @@ struct streaming_query_canonical_parsing { lookup_result lookup_advanced(const char* kmer) { /* 1. validation */ - bool is_valid = m_start ? util::is_valid(kmer, m_k) : util::is_valid(kmer[m_k - 1]); + bool is_valid = + m_start ? util::is_valid(kmer, m_k) : kmer_t::is_valid(kmer[m_k - 1]); if (!is_valid) { start(); return lookup_result(); @@ -62,7 +63,8 @@ struct streaming_query_canonical_parsing { } m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start); assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); - m_kmer_rc = m_kmer.reverse_complement(m_k); + m_kmer_rc = m_kmer; + m_kmer_rc.reverse_complement_inplace(m_k); constexpr bool reverse = true; uint64_t minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start, reverse); assert(minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); @@ -174,7 +176,8 @@ struct streaming_query_canonical_parsing { kmer_t val = m_string_iterator.read(2 * m_k); if (check_minimizer and super_kmer_id == begin and m_pos_in_window == 0) { - kmer_t val_rc = val.reverse_complement(m_k); + kmer_t val_rc = val; + val_rc.reverse_complement_inplace(m_k); uint64_t minimizer = std::min(util::compute_minimizer(val, m_k, m_m, m_seed), util::compute_minimizer(val_rc, m_k, m_m, m_seed)); diff --git a/include/query/streaming_query_regular_parsing.hpp b/include/query/streaming_query_regular_parsing.hpp index 360468b..ad58989 100644 --- a/include/query/streaming_query_regular_parsing.hpp +++ b/include/query/streaming_query_regular_parsing.hpp @@ -50,7 +50,8 @@ struct streaming_query_regular_parsing { lookup_result lookup_advanced(const char* kmer) { /* 1. validation */ - bool is_valid = m_start ? util::is_valid(kmer, m_k) : util::is_valid(kmer[m_k - 1]); + bool is_valid = + m_start ? util::is_valid(kmer, m_k) : kmer_t::is_valid(kmer[m_k - 1]); if (!is_valid) { m_start = true; return lookup_result(); @@ -66,7 +67,8 @@ struct streaming_query_regular_parsing { } m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start); assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed)); - m_kmer_rc = m_kmer.reverse_complement(m_k); + m_kmer_rc = m_kmer; + m_kmer_rc.reverse_complement_inplace(m_k); constexpr bool reverse = true; m_curr_minimizer_rc = m_minimizer_enum_rc.next(m_kmer_rc, m_start, reverse); assert(m_curr_minimizer_rc == util::compute_minimizer(m_kmer_rc, m_k, m_m, m_seed)); diff --git a/include/statistics.impl b/include/statistics.impl index e9f59fe..c9fef8e 100644 --- a/include/statistics.impl +++ b/include/statistics.impl @@ -28,7 +28,8 @@ void dictionary::compute_statistics() const { kmer_t kmer = bv_it.read_and_advance_by_char(kmer_t::bits_per_char * m_k); auto [minimizer, pos] = util::compute_minimizer_pos(kmer, m_k, m_m, m_seed); if (m_canonical_parsing) { - kmer_t kmer_rc = kmer.reverse_complement(m_k); + kmer_t kmer_rc = kmer; + kmer_rc.reverse_complement_inplace(m_k); auto [minimizer_rc, pos_rc] = util::compute_minimizer_pos(kmer_rc, m_k, m_m, m_seed); if (minimizer_rc < minimizer) { diff --git a/include/util.hpp b/include/util.hpp index 18e5a57..0667fda 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -148,66 +148,10 @@ template return str; } -/* - Forward character map: - A -> A 65 - C -> C 67 - G -> G 71 - T -> T 84 - a -> a 97 - c -> c 99 - g -> g 103 - t -> t 116 - All other chars map to zero. -*/ -static const char canonicalize_basepair_forward_map[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 65, 0, 67, 0, 0, 0, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 0, 99, 0, 0, 0, 103, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - -/* - Reverse character map: - 65 A -> T 84 - 67 C -> G 71 - 71 G -> C 67 - 84 T -> A 65 - 97 a -> t 116 - 99 c -> g 103 - 103 g -> c 99 - 116 t -> a 97 - All other chars map to zero. -*/ -static const char canonicalize_basepair_reverse_map[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 84, 0, 71, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 103, 0, 0, 0, 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 97, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - -[[maybe_unused]] static void compute_reverse_complement(char const* input, char* output, - uint64_t size) { - for (uint64_t i = 0; i != size; ++i) { - int c = input[i]; - output[size - i - 1] = canonicalize_basepair_reverse_map[c]; - } -} - -static inline bool is_valid(int c) { return canonicalize_basepair_forward_map[c]; } - +template [[maybe_unused]] static bool is_valid(char const* str, uint64_t size) { for (uint64_t i = 0; i != size; ++i) { - int c = str[i]; - if (canonicalize_basepair_forward_map[c] == 0) return false; + if (!kmer_t::is_valid(str[i])) { return false; } } return true; } @@ -217,7 +161,7 @@ static inline bool is_valid(int c) { return canonicalize_basepair_forward_map[c] */ template uint64_t compute_minimizer(kmer_t kmer, const uint64_t k, const uint64_t m, const uint64_t seed) { - assert(m <= constants::max_m); + assert(m <= kmer_t::max_m); assert(m <= k); uint64_t min_hash = uint64_t(-1); kmer_t minimizer = kmer_t(-1); @@ -238,7 +182,7 @@ uint64_t compute_minimizer(kmer_t kmer, const uint64_t k, const uint64_t m, cons template std::pair compute_minimizer_pos(kmer_t kmer, uint64_t k, uint64_t m, uint64_t seed) { - assert(m <= constants::max_m); + assert(m <= kmer_t::max_m); assert(m <= k); uint64_t min_hash = uint64_t(-1); kmer_t minimizer = kmer_t(-1); diff --git a/src/bench_utils.hpp b/src/bench_utils.hpp index f32dd18..9062b60 100644 --- a/src/bench_utils.hpp +++ b/src/bench_utils.hpp @@ -35,7 +35,7 @@ void perf_test_lookup_access(dictionary const& dict) { dict.access(id, kmer.data()); if ((i & 1) == 0) { /* transform 50% of the kmers into their reverse complements */ - util::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); + kmer_t::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); lookup_queries.push_back(kmer_rc); } else { lookup_queries.push_back(kmer); @@ -82,7 +82,7 @@ void perf_test_lookup_access(dictionary const& dict) { dict.access(id, kmer.data()); if ((i & 1) == 0) { /* transform 50% of the kmers into their reverse complements */ - util::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); + kmer_t::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); lookup_queries.push_back(kmer_rc); } else { lookup_queries.push_back(kmer); @@ -161,7 +161,7 @@ void perf_test_lookup_weight(dictionary const& dict) { dict.access(id, kmer.data()); if ((i & 1) == 0) { /* transform 50% of the kmers into their reverse complements */ - util::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); + kmer_t::compute_reverse_complement(kmer.data(), kmer_rc.data(), k); lookup_queries.push_back(kmer_rc); } else { lookup_queries.push_back(kmer); diff --git a/src/check_utils.hpp b/src/check_utils.hpp index a4d9d10..ab4df99 100644 --- a/src/check_utils.hpp +++ b/src/check_utils.hpp @@ -40,7 +40,7 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& ++num_lines; for (uint64_t i = 0; i + k <= line.size(); ++i) { - assert(util::is_valid(line.data() + i, k)); + assert(util::is_valid(line.data() + i, k)); kmer_t uint_kmer = util::string_to_uint_kmer(line.data() + i, k); bool orientation = constants::forward_orientation; @@ -51,7 +51,7 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& /* transform 50% of the kmers into their reverse complements */ if ((num_kmers & 1) == 0) { - uint_kmer = uint_kmer.reverse_complement(k); + uint_kmer.reverse_complement_inplace(k); orientation = constants::backward_orientation; } @@ -140,7 +140,8 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& // check access dict.access(id, got_kmer_str.data()); kmer_t got_uint_kmer = util::string_to_uint_kmer(got_kmer_str.data(), k); - kmer_t got_uint_kmer_rc = got_uint_kmer.reverse_complement(k); + kmer_t got_uint_kmer_rc = got_uint_kmer; + got_uint_kmer_rc.reverse_complement_inplace(k); if (got_uint_kmer != uint_kmer and got_uint_kmer_rc != uint_kmer) { std::cout << "ERROR: got '" << got_kmer_str << "' but expected '" << expected_kmer_str << "'" << std::endl; @@ -192,7 +193,7 @@ bool check_correctness_navigational_kmer_query(std::istream& is, dictionary(line.data() + i, k)); if (num_kmers != 0 and num_kmers % 5000000 == 0) { std::cout << "checked " << num_kmers << " kmers" << std::endl; } diff --git a/src/permute.cpp b/src/permute.cpp index e716be1..3cb0d5c 100644 --- a/src/permute.cpp +++ b/src/permute.cpp @@ -94,7 +94,7 @@ int permute(int argc, char** argv) { } /* permute and save to output file */ - permute_and_write(input_filename, output_filename, tmp_dirname, permutation, signs, k); + permute_and_write(input_filename, output_filename, tmp_dirname, permutation, signs, k); std::remove(permutation_filename.c_str()); return 0; diff --git a/test/test_alphabet.cpp b/test/test_alphabet.cpp index 701726b..788df29 100644 --- a/test/test_alphabet.cpp +++ b/test/test_alphabet.cpp @@ -38,7 +38,7 @@ int main(int argc, char** argv) { "c"); for (uint64_t i = 0; i != read.length() - k + 1; ++i) { - bool is_valid = util::is_valid(read.data() + i, k); + bool is_valid = util::is_valid(read.data() + i, k); if (!is_valid) { std::cerr << "ERROR: '" << std::string(read.data() + i, k) << "' is NOT valid!" << std::endl; @@ -79,40 +79,40 @@ int main(int argc, char** argv) { for (uint64_t i = 0; i != 1000; ++i) { // generate a random kmer of length kmer_len random_kmer(kmer.data(), kmer_len); - util::compute_reverse_complement(kmer.data(), rc.data(), kmer_len); + kmer_t::compute_reverse_complement(kmer.data(), rc.data(), kmer_len); kmer_t uint_kmer = util::string_to_uint_kmer(kmer.data(), kmer_len); - uint_kmer = uint_kmer.reverse_complement(kmer_len); + uint_kmer.reverse_complement_inplace(kmer_len); expect(util::uint_kmer_to_string(uint_kmer, kmer_len), rc); } } /****/ - expect(util::canonicalize_basepair_forward_map[int('A')], 'A'); - expect(util::canonicalize_basepair_forward_map[int('a')], 'a'); + expect(kmer_t::canonicalize_basepair_forward_map[int('A')], 'A'); + expect(kmer_t::canonicalize_basepair_forward_map[int('a')], 'a'); - expect(util::canonicalize_basepair_forward_map[int('C')], 'C'); - expect(util::canonicalize_basepair_forward_map[int('c')], 'c'); + expect(kmer_t::canonicalize_basepair_forward_map[int('C')], 'C'); + expect(kmer_t::canonicalize_basepair_forward_map[int('c')], 'c'); - expect(util::canonicalize_basepair_forward_map[int('T')], 'T'); - expect(util::canonicalize_basepair_forward_map[int('t')], 't'); + expect(kmer_t::canonicalize_basepair_forward_map[int('T')], 'T'); + expect(kmer_t::canonicalize_basepair_forward_map[int('t')], 't'); - expect(util::canonicalize_basepair_forward_map[int('G')], 'G'); - expect(util::canonicalize_basepair_forward_map[int('g')], 'g'); + expect(kmer_t::canonicalize_basepair_forward_map[int('G')], 'G'); + expect(kmer_t::canonicalize_basepair_forward_map[int('g')], 'g'); /****/ - expect(util::canonicalize_basepair_reverse_map[int('A')], 'T'); - expect(util::canonicalize_basepair_reverse_map[int('a')], 't'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('A')], 'T'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('a')], 't'); - expect(util::canonicalize_basepair_reverse_map[int('C')], 'G'); - expect(util::canonicalize_basepair_reverse_map[int('c')], 'g'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('C')], 'G'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('c')], 'g'); - expect(util::canonicalize_basepair_reverse_map[int('T')], 'A'); - expect(util::canonicalize_basepair_reverse_map[int('t')], 'a'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('T')], 'A'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('t')], 'a'); - expect(util::canonicalize_basepair_reverse_map[int('G')], 'C'); - expect(util::canonicalize_basepair_reverse_map[int('g')], 'c'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('G')], 'C'); + expect(kmer_t::canonicalize_basepair_reverse_map[int('g')], 'c'); return 0; } From 29b6c6be3a5697d41e5b30fcbfdf1033ff4269cd Mon Sep 17 00:00:00 2001 From: Harun Mustafa Date: Thu, 20 Jun 2024 14:19:46 +0200 Subject: [PATCH 19/21] Fix warnings and multi-threading issues --- external/pthash | 2 +- include/minimizers.hpp | 8 ++++++-- include/util.hpp | 5 ++++- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/external/pthash b/external/pthash index 28aedcb..09b9bec 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit 28aedcb03af096c0e9988b1ad240da7b4cf010d7 +Subproject commit 09b9bec3a9027b47c131061ec6a485454a698bcc diff --git a/include/minimizers.hpp b/include/minimizers.hpp index f58cb90..937c4db 100644 --- a/include/minimizers.hpp +++ b/include/minimizers.hpp @@ -13,7 +13,7 @@ struct minimizers { mphf_config.seed = util::get_seed_for_hash_function(build_config); mphf_config.minimal_output = true; mphf_config.verbose_output = false; - mphf_config.num_threads = std::thread::hardware_concurrency(); + mphf_config.num_threads = build_config.num_threads; mphf_config.num_partitions = 4 * mphf_config.num_threads; if (size / mphf_config.num_partitions < pthash::constants::min_partition_size) { @@ -29,6 +29,10 @@ struct minimizers { mphf_config.ram = 4 * essentials::GB; mphf_config.tmp_dir = build_config.tmp_dirname; + if (size <= 1) { + throw std::runtime_error( + "each partition must contain more than one key: use less partitions"); + } m_mphf.build_in_external_memory(begin, size, mphf_config); } @@ -49,4 +53,4 @@ struct minimizers { minimizers_pthash_type m_mphf; }; -} // namespace sshash \ No newline at end of file +} // namespace sshash diff --git a/include/util.hpp b/include/util.hpp index 0667fda..3cc1218 100644 --- a/include/util.hpp +++ b/include/util.hpp @@ -3,6 +3,7 @@ #include #include #include +#include #include // for std::ceil on linux #include "hash_util.hpp" @@ -74,6 +75,7 @@ struct build_configuration { : k(31) , m(17) , seed(constants::seed) + , num_threads(std::thread::hardware_concurrency()) , l(constants::min_l) , c(constants::c) @@ -87,6 +89,7 @@ struct build_configuration { uint64_t k; // kmer size uint64_t m; // minimizer size uint64_t seed; + uint64_t num_threads; uint64_t l; // drive dictionary trade-off double c; // drive PTHash trade-off @@ -302,4 +305,4 @@ struct buffered_lines_iterator { uint64_t m_read_chars; }; -} // namespace sshash \ No newline at end of file +} // namespace sshash From f3b64711543919b99fc91fa53d71268bf520a252 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 15 Jul 2024 17:03:39 +0200 Subject: [PATCH 20/21] Update pthash --- external/pthash | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/external/pthash b/external/pthash index 9466359..3042587 160000 --- a/external/pthash +++ b/external/pthash @@ -1 +1 @@ -Subproject commit 94663599ab7d092a9301bf8c8709fbf558a04601 +Subproject commit 3042587a9a8d21032e06be263d1628158967d9dd From 72a92b62da0538e18177ce0494832b315d103844 Mon Sep 17 00:00:00 2001 From: Oleksandr Kulkov Date: Mon, 15 Jul 2024 17:14:12 +0200 Subject: [PATCH 21/21] Make bits_per_kmer_formula static, remove partition size exception --- include/info.impl | 4 ++-- include/minimizers.hpp | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/info.impl b/include/info.impl index 02c3d6c..38347c9 100644 --- a/include/info.impl +++ b/include/info.impl @@ -1,6 +1,6 @@ namespace sshash { -double bits_per_kmer_formula(uint64_t k, /* kmer length */ +static double bits_per_kmer_formula(uint64_t k, /* kmer length */ uint64_t m, /* minimizer length */ uint64_t n, /* num. kmers */ uint64_t M) /* num. strings in SPSS */ @@ -90,4 +90,4 @@ void dictionary::print_info() const { print_space_breakdown(); } -} // namespace sshash \ No newline at end of file +} // namespace sshash diff --git a/include/minimizers.hpp b/include/minimizers.hpp index 61985d6..d9be4ce 100644 --- a/include/minimizers.hpp +++ b/include/minimizers.hpp @@ -29,10 +29,6 @@ struct minimizers { mphf_config.ram = 4 * essentials::GB; mphf_config.tmp_dir = build_config.tmp_dirname; - if (size <= 1) { - throw std::runtime_error( - "each partition must contain more than one key: use less partitions"); - } m_mphf.build_in_external_memory(begin, size, mphf_config); }