diff --git a/src/arguments.hpp b/src/arguments.hpp index 547a55b6..83a961de 100644 --- a/src/arguments.hpp +++ b/src/arguments.hpp @@ -26,10 +26,11 @@ struct SeedingArguments { "results with non default values.", {'s'}} , bits{parser, "INT", "No. of top bits of hash to use as bucket indices (8-31)" "[determined from reference size]", {'b'}} + , digest{parser, "INT", "Number of bits to be selected from the second strobe hash [24]", {"digest"}} { } args::ArgumentParser& parser; - args::ValueFlag r, m, k, l, u, c, s, bits; + args::ValueFlag r, m, k, l, u, c, s, bits, digest; }; #endif diff --git a/src/cmdline.cpp b/src/cmdline.cpp index c00c5479..d3917136 100644 --- a/src/cmdline.cpp +++ b/src/cmdline.cpp @@ -117,6 +117,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) { if (seeding.s) { opt.s = args::get(seeding.s); opt.s_set = true; } if (seeding.c) { opt.c = args::get(seeding.c); opt.c_set = true; } if (seeding.bits) { opt.bits = args::get(seeding.bits); } + if (seeding.digest) { opt.digest = args::get(seeding.digest); } // Alignment // if (n) { n = args::get(n); } diff --git a/src/cmdline.hpp b/src/cmdline.hpp index 2ae3186e..1f3c80eb 100644 --- a/src/cmdline.hpp +++ b/src/cmdline.hpp @@ -45,6 +45,7 @@ struct CommandLineOptions { int u { 7 }; int s { 16 }; int c { 8 }; + int digest {24}; // Alignment int A { 2 }; diff --git a/src/dumpstrobes.cpp b/src/dumpstrobes.cpp index 4a8a1d88..28da3cf9 100644 --- a/src/dumpstrobes.cpp +++ b/src/dumpstrobes.cpp @@ -101,7 +101,7 @@ int run_dumpstrobes(int argc, char **argv) { } // Seeding - int r{150}, k{20}, s{16}, c{8}, l{1}, u{7}; + int r{150}, k{20}, s{16}, c{8}, l{1}, u{7}, digest{8}; int max_seed_len{}; bool k_set{false}, s_set{false}, c_set{false}, max_seed_len_set{false}, l_set{false}, u_set{false}; @@ -125,7 +125,8 @@ int run_dumpstrobes(int argc, char **argv) { l_set ? l : IndexParameters::DEFAULT, u_set ? u : IndexParameters::DEFAULT, c_set ? c : IndexParameters::DEFAULT, - max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT + max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT, + digest ? digest: IndexParameters::DEFAULT ); logger.info() << index_parameters << '\n'; diff --git a/src/index.cpp b/src/index.cpp index 7773e509..e2415a59 100644 --- a/src/index.cpp +++ b/src/index.cpp @@ -149,6 +149,8 @@ void StrobemerIndex::populate(float f, size_t n_threads) { } stats.tot_strobemer_count = total_randstrobes; + logger.info() << "Digest parameter is : " << parameters.randstrobe.digest; + logger.debug() << " Total number of randstrobes: " << total_randstrobes << '\n'; uint64_t memory_bytes = references.total_length() + sizeof(RefRandstrobe) * total_randstrobes + sizeof(bucket_index_t) * (1u << bits); logger.debug() << " Estimated total memory usage: " << memory_bytes / 1E9 << " GB\n"; diff --git a/src/index.hpp b/src/index.hpp index 941db7de..3cf3273b 100644 --- a/src/index.hpp +++ b/src/index.hpp @@ -17,6 +17,7 @@ #include "refs.hpp" #include "randstrobes.hpp" #include "indexparameters.hpp" +#include "logger.hpp" struct IndexCreationStatistics { @@ -38,6 +39,7 @@ struct StrobemerIndex { using bucket_index_t = uint64_t; StrobemerIndex(const References& references, const IndexParameters& parameters, int bits=-1) : filter_cutoff(0) + , partial_filter_cutoff(2000) //fixme initialize somewhere else , parameters(parameters) , references(references) , bits(bits == -1 ? pick_bits(references.total_length()) : bits) @@ -47,6 +49,7 @@ struct StrobemerIndex { } } unsigned int filter_cutoff; + unsigned int partial_filter_cutoff; mutable IndexCreationStatistics stats; void write(const std::string& filename) const; @@ -80,6 +83,37 @@ struct StrobemerIndex { return end(); } + //Returns the first entry that matches the first strobe subhash (if using multi-context seeds) + size_t partial_find(randstrobe_hash_t key) const { + uint strobe2_digest = parameters.randstrobe.digest; + randstrobe_hash_t key_prefix = key >> strobe2_digest; + + constexpr int MAX_LINEAR_SEARCH = 4; + const unsigned int top_N = key >> (64 - bits); + bucket_index_t position_start = randstrobe_start_indices[top_N]; + bucket_index_t position_end = randstrobe_start_indices[top_N + 1]; + if (position_start == position_end) { + return end(); + } + + if (position_end - position_start < MAX_LINEAR_SEARCH) { + for ( ; position_start < position_end; ++position_start) { + if (randstrobes[position_start].hash >> strobe2_digest == key_prefix) return position_start; + if (randstrobes[position_start].hash >> strobe2_digest > key_prefix) return end(); + } + return end(); + } + auto cmp = [&strobe2_digest](const RefRandstrobe lhs, const RefRandstrobe rhs) { + return (lhs.hash >> strobe2_digest) < (rhs.hash >> strobe2_digest); }; + + auto pos = std::lower_bound(randstrobes.begin() + position_start, + randstrobes.begin() + position_end, + RefRandstrobe{key, 0, 0}, + cmp); + if (pos->hash == key) return pos - randstrobes.begin(); + return end(); + } + randstrobe_hash_t get_hash(bucket_index_t position) const { if (position < randstrobes.size()) { return randstrobes[position].hash; @@ -92,6 +126,11 @@ struct StrobemerIndex { return get_hash(position) == get_hash(position + filter_cutoff); } + bool is_partial_filtered(bucket_index_t position) const { + uint shift = parameters.randstrobe.digest; + return (get_hash(position) >> shift) == (get_hash(position + partial_filter_cutoff) >> shift); + } + unsigned int get_strobe1_position(bucket_index_t position) const { return randstrobes[position].position; } diff --git a/src/indexparameters.cpp b/src/indexparameters.cpp index 2f634e9f..7235d4ce 100644 --- a/src/indexparameters.cpp +++ b/src/indexparameters.cpp @@ -18,7 +18,8 @@ bool RandstrobeParameters::operator==(const RandstrobeParameters& other) const { && this->q == other.q && this->max_dist == other.max_dist && this->w_min == other.w_min - && this->w_max == other.w_max; + && this->w_max == other.w_max + && this->digest == other.digest; } /* Pre-defined index parameters that work well for a certain @@ -30,24 +31,25 @@ struct Profile { int s_offset; int l; int u; + int digest; }; static auto max{std::numeric_limits::max()}; static std::vector profiles = { - Profile{ 50, 90, 18, -4, -2, 1}, - Profile{100, 110, 20, -4, -2, 2}, - Profile{125, 135, 20, -4, -1, 4}, - Profile{150, 175, 20, -4, 1, 7}, - Profile{250, 375, 22, -4, 2, 12}, - Profile{400, max, 23, -6, 2, 12}, + Profile{ 50, 90, 18, -4, -2, 1, 24}, + Profile{100, 110, 20, -4, -2, 2, 24}, + Profile{125, 135, 20, -4, -1, 4, 24}, + Profile{150, 175, 20, -4, 1, 7, 24}, + Profile{250, 375, 22, -4, 2, 12, 24}, + Profile{400, max, 23, -6, 2, 12, 24}, }; /* Create an IndexParameters instance based on a given read length. * k, s, l, u, c and max_seed_len can be used to override determined parameters * by setting them to a value other than IndexParameters::DEFAULT. */ -IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len) { +IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len, int digest) { const int default_c = 8; size_t canonical_read_length = 50; for (const auto& p : profiles) { @@ -64,6 +66,9 @@ IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, if (u == DEFAULT) { u = p.u; } + if (digest == DEFAULT) { + digest = p.digest; + } canonical_read_length = p.canonical_read_length; break; } @@ -78,7 +83,7 @@ IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, } int q = std::pow(2, c == DEFAULT ? default_c : c) - 1; - return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist); + return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, digest); } void IndexParameters::write(std::ostream& os) const { @@ -89,6 +94,7 @@ void IndexParameters::write(std::ostream& os) const { write_int_to_ostream(os, randstrobe.u); write_int_to_ostream(os, randstrobe.q); write_int_to_ostream(os, randstrobe.max_dist); + write_int_to_ostream(os, randstrobe.digest); } IndexParameters IndexParameters::read(std::istream& is) { @@ -99,7 +105,8 @@ IndexParameters IndexParameters::read(std::istream& is) { int u = read_int_from_istream(is); int q = read_int_from_istream(is); int max_dist = read_int_from_istream(is); - return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist); + int digest = read_int_from_istream(is); + return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, digest); } bool IndexParameters::operator==(const IndexParameters& other) const { @@ -119,7 +126,7 @@ std::string IndexParameters::filename_extension() const { // nothing was overridden sstream << ".r" << canonical_read_length; } - sstream << ".sti"; + sstream << ".stimc"; return sstream.str(); } diff --git a/src/indexparameters.hpp b/src/indexparameters.hpp index 75e6c258..db6de72c 100644 --- a/src/indexparameters.hpp +++ b/src/indexparameters.hpp @@ -43,14 +43,16 @@ struct RandstrobeParameters { const int max_dist; const unsigned w_min; const unsigned w_max; + const unsigned digest; - RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max) + RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max, unsigned digest) : l(l) , u(u) , q(q) , max_dist(max_dist) , w_min(w_min) , w_max(w_max) + , digest(digest) { verify(); } @@ -65,6 +67,9 @@ struct RandstrobeParameters { if (w_min > w_max) { throw BadParameter("w_min is greater than w_max (choose different -l/-u parameters)"); } + if (digest > 63) { + throw BadParameter("digest size is larger than 63"); + } } }; @@ -77,15 +82,15 @@ class IndexParameters { static const int DEFAULT = std::numeric_limits::min(); - IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist) + IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist, int digest) : canonical_read_length(canonical_read_length) , syncmer(k, s) - , randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u) + , randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u, digest) { } static IndexParameters from_read_length( - int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT + int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT, int digest = DEFAULT ); static IndexParameters read(std::istream& os); std::string filename_extension() const; diff --git a/src/main.cpp b/src/main.cpp index dcc96774..6da5763e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -167,7 +167,8 @@ int run_strobealign(int argc, char **argv) { opt.l_set ? opt.l : IndexParameters::DEFAULT, opt.u_set ? opt.u : IndexParameters::DEFAULT, opt.c_set ? opt.c : IndexParameters::DEFAULT, - opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT + opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT, + opt.digest ? opt.digest : IndexParameters::DEFAULT ); logger.debug() << index_parameters << '\n'; AlignmentParameters aln_params; diff --git a/src/nam.cpp b/src/nam.cpp index 2dfbccdb..37e1cf76 100644 --- a/src/nam.cpp +++ b/src/nam.cpp @@ -1,4 +1,5 @@ #include "nam.hpp" +static Logger& logger = Logger::get(); namespace { @@ -14,12 +15,14 @@ inline void add_to_hits_per_ref( int query_start, int query_end, const StrobemerIndex& index, - size_t position + size_t position, + bool is_partial ) { int min_diff = std::numeric_limits::max(); for (const auto hash = index.get_hash(position); index.get_hash(position) == hash; ++position) { int ref_start = index.get_strobe1_position(position); - int ref_end = ref_start + index.strobe2_offset(position) + index.k(); + uint strobe2_position = is_partial ? 0 : index.strobe2_offset(position); + int ref_end = ref_start + strobe2_position + index.k(); int diff = std::abs((query_end - query_start) - (ref_end - ref_start)); if (diff <= min_diff) { hits_per_ref[index.reference_index(position)].push_back(Hit{query_start, query_end, ref_start, ref_end}); @@ -171,7 +174,17 @@ std::pair> find_nams( continue; } nr_good_hits++; - add_to_hits_per_ref(hits_per_ref[q.is_reverse], q.start, q.end, index, position); + add_to_hits_per_ref(hits_per_ref[q.is_reverse], q.start, q.end, index, position, false); + } else { + size_t partial_pos = index.partial_find(q.hash); + if (partial_pos != index.end()) { + total_hits++; + if (index.is_partial_filtered(position)) { + continue; + } + nr_good_hits++; + add_to_hits_per_ref(hits_per_ref[q.is_reverse], q.start, q.end, index, partial_pos, true); + } } } float nonrepetitive_fraction = total_hits > 0 ? ((float) nr_good_hits) / ((float) total_hits) : 1.0; @@ -231,12 +244,14 @@ std::vector find_nams_rescue( if ((rh.count > rescue_cutoff && cnt >= 5) || rh.count > 1000) { break; } - add_to_hits_per_ref(hits_per_ref[is_revcomp], rh.query_start, rh.query_end, index, rh.position); + add_to_hits_per_ref(hits_per_ref[is_revcomp], rh.query_start, rh.query_end, index, rh.position, false); cnt++; } is_revcomp++; } + + return merge_hits_into_nams_forward_and_reverse(hits_per_ref, index.k(), true); } diff --git a/src/randstrobes.cpp b/src/randstrobes.cpp index 3cfb5a0a..47f092d7 100644 --- a/src/randstrobes.cpp +++ b/src/randstrobes.cpp @@ -49,6 +49,12 @@ static inline randstrobe_hash_t randstrobe_hash(syncmer_hash_t hash1, syncmer_ha return hash1 + hash2; } +static inline digest_hash_t digest_hash(syncmer_hash_t hash1, syncmer_hash_t hash2, size_t digest_size) { + syncmer_hash_t main_hash = std::min(hash1, hash2); + syncmer_hash_t aux_hash = std::max(hash1, hash2); + return ((main_hash >> digest_size) << digest_size) ^ (aux_hash >> (64 - digest_size)); +} + std::ostream& operator<<(std::ostream& os, const Syncmer& syncmer) { os << "Syncmer(hash=" << syncmer.hash << ", position=" << syncmer.position << ")"; return os; @@ -167,7 +173,10 @@ Randstrobe RandstrobeIterator::get(unsigned int strobe1_index) const { } } - return Randstrobe{randstrobe_hash(strobe1.hash, strobe2.hash), static_cast(strobe1.position), static_cast(strobe2.position)}; +// return Randstrobe{randstrobe_hash(strobe1.hash, strobe2.hash), static_cast(strobe1.position), static_cast(strobe2.position)}; + return Randstrobe{digest_hash(strobe1.hash, strobe2.hash, digest), + static_cast(strobe1.position), + static_cast(strobe2.position)}; } Randstrobe RandstrobeGenerator::next() { @@ -198,7 +207,10 @@ Randstrobe RandstrobeGenerator::next() { } } syncmers.pop_front(); - return Randstrobe{randstrobe_hash(strobe1.hash, strobe2.hash), static_cast(strobe1.position), static_cast(strobe2.position)}; +// return Randstrobe{randstrobe_hash(strobe1.hash, strobe2.hash), static_cast(strobe1.position), static_cast(strobe2.position)}; + return Randstrobe{digest_hash(strobe1.hash, strobe2.hash, digest), + static_cast(strobe1.position), + static_cast(strobe2.position)}; } /* diff --git a/src/randstrobes.hpp b/src/randstrobes.hpp index 7a117b3c..9ff77dad 100644 --- a/src/randstrobes.hpp +++ b/src/randstrobes.hpp @@ -16,6 +16,7 @@ using syncmer_hash_t = uint64_t; using randstrobe_hash_t = uint64_t; +using digest_hash_t = uint64_t; struct RefRandstrobe { using packed_t = uint32_t; @@ -97,6 +98,7 @@ class RandstrobeIterator { , w_max(parameters.w_max) , q(parameters.q) , max_dist(parameters.max_dist) + , digest(parameters.digest) { if (w_min > w_max) { throw std::invalid_argument("w_min is greater than w_max"); @@ -118,6 +120,7 @@ class RandstrobeIterator { const unsigned w_max; const uint64_t q; const unsigned int max_dist; + uint digest; unsigned int strobe1_index = 0; }; @@ -166,6 +169,7 @@ class RandstrobeGenerator { , w_max(randstrobe_parameters.w_max) , q(randstrobe_parameters.q) , max_dist(randstrobe_parameters.max_dist) + , digest(randstrobe_parameters.digest) { } Randstrobe next(); @@ -177,6 +181,7 @@ class RandstrobeGenerator { const unsigned w_max; const uint64_t q; const unsigned int max_dist; + uint digest; std::deque syncmers; };