Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add multi-context seeds #388

Closed
wants to merge 13 commits into from
Closed
3 changes: 2 additions & 1 deletion src/arguments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@ struct SeedingArguments {
"results with non default values.", {'s'}}
, bits{parser, "INT", "No. of top bits of hash to use as bucket indices (8-31)"
"[determined from reference size]", {'b'}}
, aux_len{parser, "INT", "Number of bits to be selected from the second strobe hash [24]", {"aux-len"}}
{
}
args::ArgumentParser& parser;
args::ValueFlag<int> r, m, k, l, u, c, s, bits;
args::ValueFlag<int> r, m, k, l, u, c, s, bits, aux_len;
};

#endif
1 change: 1 addition & 0 deletions src/cmdline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
if (seeding.s) { opt.s = args::get(seeding.s); opt.s_set = true; }
if (seeding.c) { opt.c = args::get(seeding.c); opt.c_set = true; }
if (seeding.bits) { opt.bits = args::get(seeding.bits); }
if (seeding.aux_len) { opt.aux_len = args::get(seeding.aux_len); }

// Alignment
// if (n) { n = args::get(n); }
Expand Down
1 change: 1 addition & 0 deletions src/cmdline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ struct CommandLineOptions {
int u { 7 };
int s { 16 };
int c { 8 };
int aux_len{24};

// Alignment
int A { 2 };
Expand Down
5 changes: 3 additions & 2 deletions src/dumpstrobes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ int run_dumpstrobes(int argc, char **argv) {
}

// Seeding
int r{150}, k{20}, s{16}, c{8}, l{1}, u{7};
int r{150}, k{20}, s{16}, c{8}, l{1}, u{7}, aux_len{24};
int max_seed_len{};

bool k_set{false}, s_set{false}, c_set{false}, max_seed_len_set{false}, l_set{false}, u_set{false};
Expand All @@ -125,7 +125,8 @@ int run_dumpstrobes(int argc, char **argv) {
l_set ? l : IndexParameters::DEFAULT,
u_set ? u : IndexParameters::DEFAULT,
c_set ? c : IndexParameters::DEFAULT,
max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT
max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT,
aux_len ? aux_len : IndexParameters::DEFAULT
);

logger.info() << index_parameters << '\n';
Expand Down
5 changes: 4 additions & 1 deletion src/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ void StrobemerIndex::populate(float f, unsigned n_threads) {
}
stats.tot_strobemer_count = total_randstrobes;

logger.info() << "Auxiliary hash length is : " << parameters.randstrobe.aux_len;
Itolstoganov marked this conversation as resolved.
Show resolved Hide resolved

logger.debug() << " Total number of randstrobes: " << total_randstrobes << '\n';
uint64_t memory_bytes = references.total_length() + sizeof(RefRandstrobe) * total_randstrobes + sizeof(bucket_index_t) * (1u << bits);
logger.debug() << " Estimated total memory usage: " << memory_bytes / 1E9 << " GB\n";
Expand Down Expand Up @@ -241,6 +243,7 @@ void StrobemerIndex::populate(float f, unsigned n_threads) {
filter_cutoff = 30;
}
stats.filter_cutoff = filter_cutoff;
partial_filter_cutoff = 100;
stats.elapsed_hash_index = hash_index_timer.duration();
stats.distinct_strobemers = unique_mers;
}
Expand Down Expand Up @@ -304,7 +307,7 @@ void StrobemerIndex::assign_randstrobes(size_t ref_index, size_t offset) {
chunk.push_back(randstrobe);
}
for (auto randstrobe : chunk) {
RefRandstrobe::packed_t packed = ref_index << 8;
RefRandstrobe::packed_t packed = (ref_index << 9) | (randstrobe.main_is_first << 8);
packed = packed + (randstrobe.strobe2_pos - randstrobe.strobe1_pos);
randstrobes[offset++] = RefRandstrobe{randstrobe.hash, randstrobe.strobe1_pos, packed};
}
Expand Down
51 changes: 51 additions & 0 deletions src/index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "refs.hpp"
#include "randstrobes.hpp"
#include "indexparameters.hpp"
#include "logger.hpp"


struct IndexCreationStatistics {
Expand All @@ -38,6 +39,7 @@ struct StrobemerIndex {
using bucket_index_t = uint64_t;
StrobemerIndex(const References& references, const IndexParameters& parameters, int bits=-1)
: filter_cutoff(0)
, partial_filter_cutoff(0)
, parameters(parameters)
, references(references)
, bits(bits == -1 ? pick_bits(references.total_length()) : bits)
Expand All @@ -47,6 +49,7 @@ struct StrobemerIndex {
}
}
unsigned int filter_cutoff;
unsigned int partial_filter_cutoff;
mutable IndexCreationStatistics stats;

void write(const std::string& filename) const;
Expand Down Expand Up @@ -80,18 +83,66 @@ struct StrobemerIndex {
return end();
}

//Returns the first entry that matches the main hash
size_t partial_find(randstrobe_hash_t key) const {
const unsigned int aux_len = parameters.randstrobe.aux_len;
randstrobe_hash_t key_prefix = key >> aux_len;

constexpr int MAX_LINEAR_SEARCH = 4;
const unsigned int top_N = key >> (64 - bits);
bucket_index_t position_start = randstrobe_start_indices[top_N];
bucket_index_t position_end = randstrobe_start_indices[top_N + 1];
if (position_start == position_end) {
return end();
}

if (position_end - position_start < MAX_LINEAR_SEARCH) {
for ( ; position_start < position_end; ++position_start) {
if (randstrobes[position_start].hash >> aux_len == key_prefix) return position_start;
if (randstrobes[position_start].hash >> aux_len > key_prefix) return end();
}
return end();
}
auto cmp = [&aux_len](const RefRandstrobe lhs, const RefRandstrobe rhs) {
return (lhs.hash >> aux_len) < (rhs.hash >> aux_len); };

auto pos = std::lower_bound(randstrobes.begin() + position_start,
randstrobes.begin() + position_end,
RefRandstrobe{key, 0, 0},
cmp);
if (pos->hash == key) return pos - randstrobes.begin();
return end();
}

randstrobe_hash_t get_hash(bucket_index_t position) const {
if (position < randstrobes.size()) {
return randstrobes[position].hash;
} else {
return end();
}
}

randstrobe_hash_t get_partial_hash(bucket_index_t position) const {
if (position < randstrobes.size()) {
return randstrobes[position].hash >> parameters.randstrobe.aux_len;
} else {
return end();
}
}

bool first_strobe_is_main(bucket_index_t position) const {
return randstrobes[position].main_is_first();
}

bool is_filtered(bucket_index_t position) const {
return get_hash(position) == get_hash(position + filter_cutoff);
}

bool is_partial_filtered(bucket_index_t position) const {
const unsigned int shift = parameters.randstrobe.aux_len;
return (get_hash(position) >> shift) == (get_hash(position + partial_filter_cutoff) >> shift);
}

unsigned int get_strobe1_position(bucket_index_t position) const {
return randstrobes[position].position;
}
Expand Down
14 changes: 10 additions & 4 deletions src/indexparameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ bool RandstrobeParameters::operator==(const RandstrobeParameters& other) const {
&& this->q == other.q
&& this->max_dist == other.max_dist
&& this->w_min == other.w_min
&& this->w_max == other.w_max;
&& this->w_max == other.w_max
&& this->aux_len == other.aux_len;
}

/* Pre-defined index parameters that work well for a certain
Expand Down Expand Up @@ -48,7 +49,7 @@ static std::vector<Profile> profiles = {
* k, s, l, u, c and max_seed_len can be used to override determined parameters
* by setting them to a value other than IndexParameters::DEFAULT.
*/
IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len) {
IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len, int aux_len) {
const int default_c = 8;
size_t canonical_read_length = 50;
for (const auto& p : profiles) {
Expand Down Expand Up @@ -78,8 +79,11 @@ IndexParameters IndexParameters::from_read_length(int read_length, int k, int s,
max_dist = max_seed_len - k; // convert to distance in start positions
}
int q = std::pow(2, c == DEFAULT ? default_c : c) - 1;
if (aux_len == DEFAULT) {
aux_len = 24;
}

return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist);
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, aux_len);
}

void IndexParameters::write(std::ostream& os) const {
Expand All @@ -90,6 +94,7 @@ void IndexParameters::write(std::ostream& os) const {
write_int_to_ostream(os, randstrobe.u);
write_int_to_ostream(os, randstrobe.q);
write_int_to_ostream(os, randstrobe.max_dist);
write_int_to_ostream(os, randstrobe.aux_len);
}

IndexParameters IndexParameters::read(std::istream& is) {
Expand All @@ -100,7 +105,8 @@ IndexParameters IndexParameters::read(std::istream& is) {
int u = read_int_from_istream(is);
int q = read_int_from_istream(is);
int max_dist = read_int_from_istream(is);
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist);
int aux_len = read_int_from_istream(is);
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, aux_len);
}

bool IndexParameters::operator==(const IndexParameters& other) const {
Expand Down
14 changes: 9 additions & 5 deletions src/indexparameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,16 @@ struct RandstrobeParameters {
const int max_dist;
const unsigned w_min;
const unsigned w_max;
const unsigned aux_len;

RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max)
RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max, unsigned aux_len)
: l(l)
, u(u)
, q(q)
, max_dist(max_dist)
, w_min(w_min)
, w_max(w_max)
, aux_len(aux_len)
{
verify();
}
Expand All @@ -65,6 +67,9 @@ struct RandstrobeParameters {
if (w_min > w_max) {
throw BadParameter("w_min is greater than w_max (choose different -l/-u parameters)");
}
if (aux_len > 63) {
throw BadParameter("aux_len size is larger than 63");
}
}
};

Expand All @@ -77,16 +82,15 @@ class IndexParameters {

static const int DEFAULT = std::numeric_limits<int>::min();

IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist)
IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist, int aux_len)
: canonical_read_length(canonical_read_length)
, syncmer(k, s)
, randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u)
, randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u, aux_len)
{
}

static IndexParameters from_read_length(
int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT
);
int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT, int aux_len = DEFAULT);
static IndexParameters read(std::istream& os);
std::string filename_extension() const;
void write(std::ostream& os) const;
Expand Down
3 changes: 2 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,8 @@ int run_strobealign(int argc, char **argv) {
opt.l_set ? opt.l : IndexParameters::DEFAULT,
opt.u_set ? opt.u : IndexParameters::DEFAULT,
opt.c_set ? opt.c : IndexParameters::DEFAULT,
opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT
opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT,
opt.aux_len ? opt.aux_len : IndexParameters::DEFAULT
);
logger.debug() << index_parameters << '\n';
AlignmentParameters aln_params;
Expand Down
53 changes: 49 additions & 4 deletions src/nam.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "nam.hpp"
static Logger& logger = Logger::get();

namespace {

Expand All @@ -9,7 +10,7 @@ struct Hit {
int ref_end;
};

inline void add_to_hits_per_ref(
inline void add_to_hits_per_ref_full(
robin_hood::unordered_map<unsigned int, std::vector<Hit>>& hits_per_ref,
int query_start,
int query_end,
Expand All @@ -22,7 +23,40 @@ inline void add_to_hits_per_ref(
int ref_end = ref_start + index.strobe2_offset(position) + index.k();
int diff = std::abs((query_end - query_start) - (ref_end - ref_start));
if (diff <= min_diff) {
hits_per_ref[index.reference_index(position)].push_back(Hit{query_start, query_end, ref_start, ref_end});
hits_per_ref[index.reference_index(position)].push_back(
Hit{query_start, query_end, ref_start, ref_end}
);
min_diff = diff;
}
}
}

inline void add_to_hits_per_ref_partial(
robin_hood::unordered_map<unsigned int, std::vector<Hit>>& hits_per_ref,
int query_start,
int query_end,
const StrobemerIndex& index,
size_t position
) {
int min_diff = std::numeric_limits<int>::max();
for (const auto hash = index.get_partial_hash(position);
index.get_partial_hash(position) == hash;
++position) {
bool first_strobe_is_main = index.first_strobe_is_main(position);
// Construct the match from the strobe that was selected as the main part of the hash
int adj_ref_start = 0, adj_ref_end = 0;
if (first_strobe_is_main) {
adj_ref_start = index.get_strobe1_position(position);
}
else {
adj_ref_start = index.get_strobe1_position(position) + index.strobe2_offset(position);
}
adj_ref_end = adj_ref_start + index.k();
int diff = std::abs((query_end - query_start) - (adj_ref_end - adj_ref_start));
if (diff <= min_diff) {
hits_per_ref[index.reference_index(position)].push_back(
Hit{query_start, query_end, adj_ref_start, adj_ref_end}
);
min_diff = diff;
}
}
Expand Down Expand Up @@ -171,7 +205,18 @@ std::pair<float, std::vector<Nam>> find_nams(
continue;
}
nr_good_hits++;
add_to_hits_per_ref(hits_per_ref[q.is_reverse], q.start, q.end, index, position);
add_to_hits_per_ref_full(hits_per_ref[q.is_reverse], q.start, q.end, index, position);
}
else {
size_t partial_pos = index.partial_find(q.hash);
if (partial_pos != index.end()) {
total_hits++;
if (index.is_partial_filtered(position)) {
continue;
}
nr_good_hits++;
add_to_hits_per_ref_partial(hits_per_ref[q.is_reverse], q.partial_start, q.partial_end, index, partial_pos);
}
}
}
float nonrepetitive_fraction = total_hits > 0 ? ((float) nr_good_hits) / ((float) total_hits) : 1.0;
Expand Down Expand Up @@ -231,7 +276,7 @@ std::vector<Nam> find_nams_rescue(
if ((rh.count > rescue_cutoff && cnt >= 5) || rh.count > 1000) {
break;
}
add_to_hits_per_ref(hits_per_ref[is_revcomp], rh.query_start, rh.query_end, index, rh.position);
add_to_hits_per_ref_full(hits_per_ref[is_revcomp], rh.query_start, rh.query_end, index, rh.position);
cnt++;
}
is_revcomp++;
Expand Down
Loading
Loading