Skip to content

Commit

Permalink
Add multi-context seeds
Browse files Browse the repository at this point in the history
  • Loading branch information
Itolstoganov committed Jan 16, 2024
1 parent 7d7b428 commit bf8abf0
Show file tree
Hide file tree
Showing 12 changed files with 115 additions and 25 deletions.
3 changes: 2 additions & 1 deletion src/arguments.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@ struct SeedingArguments {
"results with non default values.", {'s'}}
, bits{parser, "INT", "No. of top bits of hash to use as bucket indices (8-31)"
"[determined from reference size]", {'b'}}
, digest{parser, "INT", "Number of bits to be selected from the second strobe hash [24]", {"digest"}}
{
}
args::ArgumentParser& parser;
args::ValueFlag<int> r, m, k, l, u, c, s, bits;
args::ValueFlag<int> r, m, k, l, u, c, s, bits, digest;
};

#endif
1 change: 1 addition & 0 deletions src/cmdline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ CommandLineOptions parse_command_line_arguments(int argc, char **argv) {
if (seeding.s) { opt.s = args::get(seeding.s); opt.s_set = true; }
if (seeding.c) { opt.c = args::get(seeding.c); opt.c_set = true; }
if (seeding.bits) { opt.bits = args::get(seeding.bits); }
if (seeding.digest) { opt.digest = args::get(seeding.digest); }

// Alignment
// if (n) { n = args::get(n); }
Expand Down
1 change: 1 addition & 0 deletions src/cmdline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ struct CommandLineOptions {
int u { 7 };
int s { 16 };
int c { 8 };
int digest {24};

// Alignment
int A { 2 };
Expand Down
5 changes: 3 additions & 2 deletions src/dumpstrobes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ int run_dumpstrobes(int argc, char **argv) {
}

// Seeding
int r{150}, k{20}, s{16}, c{8}, l{1}, u{7};
int r{150}, k{20}, s{16}, c{8}, l{1}, u{7}, digest{8};
int max_seed_len{};

bool k_set{false}, s_set{false}, c_set{false}, max_seed_len_set{false}, l_set{false}, u_set{false};
Expand All @@ -125,7 +125,8 @@ int run_dumpstrobes(int argc, char **argv) {
l_set ? l : IndexParameters::DEFAULT,
u_set ? u : IndexParameters::DEFAULT,
c_set ? c : IndexParameters::DEFAULT,
max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT
max_seed_len_set ? max_seed_len : IndexParameters::DEFAULT,
digest ? digest: IndexParameters::DEFAULT
);

logger.info() << index_parameters << '\n';
Expand Down
2 changes: 2 additions & 0 deletions src/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ void StrobemerIndex::populate(float f, size_t n_threads) {
}
stats.tot_strobemer_count = total_randstrobes;

logger.info() << "Digest parameter is : " << parameters.randstrobe.digest;

logger.debug() << " Total number of randstrobes: " << total_randstrobes << '\n';
uint64_t memory_bytes = references.total_length() + sizeof(RefRandstrobe) * total_randstrobes + sizeof(bucket_index_t) * (1u << bits);
logger.debug() << " Estimated total memory usage: " << memory_bytes / 1E9 << " GB\n";
Expand Down
39 changes: 39 additions & 0 deletions src/index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "refs.hpp"
#include "randstrobes.hpp"
#include "indexparameters.hpp"
#include "logger.hpp"


struct IndexCreationStatistics {
Expand All @@ -38,6 +39,7 @@ struct StrobemerIndex {
using bucket_index_t = uint64_t;
StrobemerIndex(const References& references, const IndexParameters& parameters, int bits=-1)
: filter_cutoff(0)
, partial_filter_cutoff(2000) //fixme initialize somewhere else
, parameters(parameters)
, references(references)
, bits(bits == -1 ? pick_bits(references.total_length()) : bits)
Expand All @@ -47,6 +49,7 @@ struct StrobemerIndex {
}
}
unsigned int filter_cutoff;
unsigned int partial_filter_cutoff;
mutable IndexCreationStatistics stats;

void write(const std::string& filename) const;
Expand Down Expand Up @@ -80,6 +83,37 @@ struct StrobemerIndex {
return end();
}

//Returns the first entry that matches the first strobe subhash (if using multi-context seeds)
size_t partial_find(randstrobe_hash_t key) const {
uint strobe2_digest = parameters.randstrobe.digest;
randstrobe_hash_t key_prefix = key >> strobe2_digest;

constexpr int MAX_LINEAR_SEARCH = 4;
const unsigned int top_N = key >> (64 - bits);
bucket_index_t position_start = randstrobe_start_indices[top_N];
bucket_index_t position_end = randstrobe_start_indices[top_N + 1];
if (position_start == position_end) {
return end();
}

if (position_end - position_start < MAX_LINEAR_SEARCH) {
for ( ; position_start < position_end; ++position_start) {
if (randstrobes[position_start].hash >> strobe2_digest == key_prefix) return position_start;
if (randstrobes[position_start].hash >> strobe2_digest > key_prefix) return end();
}
return end();
}
auto cmp = [&strobe2_digest](const RefRandstrobe lhs, const RefRandstrobe rhs) {
return (lhs.hash >> strobe2_digest) < (rhs.hash >> strobe2_digest); };

auto pos = std::lower_bound(randstrobes.begin() + position_start,
randstrobes.begin() + position_end,
RefRandstrobe{key, 0, 0},
cmp);
if (pos->hash == key) return pos - randstrobes.begin();
return end();
}

randstrobe_hash_t get_hash(bucket_index_t position) const {
if (position < randstrobes.size()) {
return randstrobes[position].hash;
Expand All @@ -92,6 +126,11 @@ struct StrobemerIndex {
return get_hash(position) == get_hash(position + filter_cutoff);
}

bool is_partial_filtered(bucket_index_t position) const {
uint shift = parameters.randstrobe.digest;
return (get_hash(position) >> shift) == (get_hash(position + partial_filter_cutoff) >> shift);
}

unsigned int get_strobe1_position(bucket_index_t position) const {
return randstrobes[position].position;
}
Expand Down
29 changes: 18 additions & 11 deletions src/indexparameters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ bool RandstrobeParameters::operator==(const RandstrobeParameters& other) const {
&& this->q == other.q
&& this->max_dist == other.max_dist
&& this->w_min == other.w_min
&& this->w_max == other.w_max;
&& this->w_max == other.w_max
&& this->digest == other.digest;
}

/* Pre-defined index parameters that work well for a certain
Expand All @@ -30,24 +31,25 @@ struct Profile {
int s_offset;
int l;
int u;
int digest;
};

static auto max{std::numeric_limits<int>::max()};

static std::vector<Profile> profiles = {
Profile{ 50, 90, 18, -4, -2, 1},
Profile{100, 110, 20, -4, -2, 2},
Profile{125, 135, 20, -4, -1, 4},
Profile{150, 175, 20, -4, 1, 7},
Profile{250, 375, 22, -4, 2, 12},
Profile{400, max, 23, -6, 2, 12},
Profile{ 50, 90, 18, -4, -2, 1, 24},
Profile{100, 110, 20, -4, -2, 2, 24},
Profile{125, 135, 20, -4, -1, 4, 24},
Profile{150, 175, 20, -4, 1, 7, 24},
Profile{250, 375, 22, -4, 2, 12, 24},
Profile{400, max, 23, -6, 2, 12, 24},
};

/* Create an IndexParameters instance based on a given read length.
* k, s, l, u, c and max_seed_len can be used to override determined parameters
* by setting them to a value other than IndexParameters::DEFAULT.
*/
IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len) {
IndexParameters IndexParameters::from_read_length(int read_length, int k, int s, int l, int u, int c, int max_seed_len, int digest) {
const int default_c = 8;
size_t canonical_read_length = 50;
for (const auto& p : profiles) {
Expand All @@ -64,6 +66,9 @@ IndexParameters IndexParameters::from_read_length(int read_length, int k, int s,
if (u == DEFAULT) {
u = p.u;
}
if (digest == DEFAULT) {
digest = p.digest;
}
canonical_read_length = p.canonical_read_length;
break;
}
Expand All @@ -78,7 +83,7 @@ IndexParameters IndexParameters::from_read_length(int read_length, int k, int s,
}
int q = std::pow(2, c == DEFAULT ? default_c : c) - 1;

return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist);
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, digest);
}

void IndexParameters::write(std::ostream& os) const {
Expand All @@ -89,6 +94,7 @@ void IndexParameters::write(std::ostream& os) const {
write_int_to_ostream(os, randstrobe.u);
write_int_to_ostream(os, randstrobe.q);
write_int_to_ostream(os, randstrobe.max_dist);
write_int_to_ostream(os, randstrobe.digest);
}

IndexParameters IndexParameters::read(std::istream& is) {
Expand All @@ -99,7 +105,8 @@ IndexParameters IndexParameters::read(std::istream& is) {
int u = read_int_from_istream(is);
int q = read_int_from_istream(is);
int max_dist = read_int_from_istream(is);
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist);
int digest = read_int_from_istream(is);
return IndexParameters(canonical_read_length, k, s, l, u, q, max_dist, digest);
}

bool IndexParameters::operator==(const IndexParameters& other) const {
Expand All @@ -119,7 +126,7 @@ std::string IndexParameters::filename_extension() const {
// nothing was overridden
sstream << ".r" << canonical_read_length;
}
sstream << ".sti";
sstream << ".stimc";
return sstream.str();
}

Expand Down
13 changes: 9 additions & 4 deletions src/indexparameters.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,16 @@ struct RandstrobeParameters {
const int max_dist;
const unsigned w_min;
const unsigned w_max;
const unsigned digest;

RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max)
RandstrobeParameters(int l, int u, uint64_t q, int max_dist, unsigned w_min, unsigned w_max, unsigned digest)
: l(l)
, u(u)
, q(q)
, max_dist(max_dist)
, w_min(w_min)
, w_max(w_max)
, digest(digest)
{
verify();
}
Expand All @@ -65,6 +67,9 @@ struct RandstrobeParameters {
if (w_min > w_max) {
throw BadParameter("w_min is greater than w_max (choose different -l/-u parameters)");
}
if (digest > 63) {
throw BadParameter("digest size is larger than 63");
}
}
};

Expand All @@ -77,15 +82,15 @@ class IndexParameters {

static const int DEFAULT = std::numeric_limits<int>::min();

IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist)
IndexParameters(size_t canonical_read_length, int k, int s, int l, int u, int q, int max_dist, int digest)
: canonical_read_length(canonical_read_length)
, syncmer(k, s)
, randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u)
, randstrobe(l, u, q, max_dist, std::max(0, k / (k - s + 1) + l), k / (k - s + 1) + u, digest)
{
}

static IndexParameters from_read_length(
int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT
int read_length, int k = DEFAULT, int s = DEFAULT, int l = DEFAULT, int u = DEFAULT, int c = DEFAULT, int max_seed_len = DEFAULT, int digest = DEFAULT
);
static IndexParameters read(std::istream& os);
std::string filename_extension() const;
Expand Down
3 changes: 2 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,7 +167,8 @@ int run_strobealign(int argc, char **argv) {
opt.l_set ? opt.l : IndexParameters::DEFAULT,
opt.u_set ? opt.u : IndexParameters::DEFAULT,
opt.c_set ? opt.c : IndexParameters::DEFAULT,
opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT
opt.max_seed_len_set ? opt.max_seed_len : IndexParameters::DEFAULT,
opt.digest ? opt.digest : IndexParameters::DEFAULT
);
logger.debug() << index_parameters << '\n';
AlignmentParameters aln_params;
Expand Down
23 changes: 19 additions & 4 deletions src/nam.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "nam.hpp"
static Logger& logger = Logger::get();

namespace {

Expand All @@ -14,12 +15,14 @@ inline void add_to_hits_per_ref(
int query_start,
int query_end,
const StrobemerIndex& index,
size_t position
size_t position,
bool is_partial
) {
int min_diff = std::numeric_limits<int>::max();
for (const auto hash = index.get_hash(position); index.get_hash(position) == hash; ++position) {
int ref_start = index.get_strobe1_position(position);
int ref_end = ref_start + index.strobe2_offset(position) + index.k();
uint strobe2_position = is_partial ? 0 : index.strobe2_offset(position);
int ref_end = ref_start + strobe2_position + index.k();
int diff = std::abs((query_end - query_start) - (ref_end - ref_start));
if (diff <= min_diff) {
hits_per_ref[index.reference_index(position)].push_back(Hit{query_start, query_end, ref_start, ref_end});
Expand Down Expand Up @@ -171,7 +174,17 @@ std::pair<float, std::vector<Nam>> find_nams(
continue;
}
nr_good_hits++;
add_to_hits_per_ref(hits_per_ref[q.is_reverse], q.start, q.end, index, position);
add_to_hits_per_ref(hits_per_ref[q.is_reverse], q.start, q.end, index, position, false);
} else {
size_t partial_pos = index.partial_find(q.hash);
if (partial_pos != index.end()) {
total_hits++;
if (index.is_partial_filtered(position)) {
continue;
}
nr_good_hits++;
add_to_hits_per_ref(hits_per_ref[q.is_reverse], q.start, q.end, index, partial_pos, true);
}
}
}
float nonrepetitive_fraction = total_hits > 0 ? ((float) nr_good_hits) / ((float) total_hits) : 1.0;
Expand Down Expand Up @@ -231,12 +244,14 @@ std::vector<Nam> find_nams_rescue(
if ((rh.count > rescue_cutoff && cnt >= 5) || rh.count > 1000) {
break;
}
add_to_hits_per_ref(hits_per_ref[is_revcomp], rh.query_start, rh.query_end, index, rh.position);
add_to_hits_per_ref(hits_per_ref[is_revcomp], rh.query_start, rh.query_end, index, rh.position, false);
cnt++;
}
is_revcomp++;
}



return merge_hits_into_nams_forward_and_reverse(hits_per_ref, index.k(), true);
}

Expand Down
16 changes: 14 additions & 2 deletions src/randstrobes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ static inline randstrobe_hash_t randstrobe_hash(syncmer_hash_t hash1, syncmer_ha
return hash1 + hash2;
}

static inline digest_hash_t digest_hash(syncmer_hash_t hash1, syncmer_hash_t hash2, size_t digest_size) {
syncmer_hash_t main_hash = std::min(hash1, hash2);
syncmer_hash_t aux_hash = std::max(hash1, hash2);
return ((main_hash >> digest_size) << digest_size) ^ (aux_hash >> (64 - digest_size));
}

std::ostream& operator<<(std::ostream& os, const Syncmer& syncmer) {
os << "Syncmer(hash=" << syncmer.hash << ", position=" << syncmer.position << ")";
return os;
Expand Down Expand Up @@ -167,7 +173,10 @@ Randstrobe RandstrobeIterator::get(unsigned int strobe1_index) const {
}
}

return Randstrobe{randstrobe_hash(strobe1.hash, strobe2.hash), static_cast<uint32_t>(strobe1.position), static_cast<uint32_t>(strobe2.position)};
// return Randstrobe{randstrobe_hash(strobe1.hash, strobe2.hash), static_cast<uint32_t>(strobe1.position), static_cast<uint32_t>(strobe2.position)};
return Randstrobe{digest_hash(strobe1.hash, strobe2.hash, digest),
static_cast<uint32_t>(strobe1.position),
static_cast<uint32_t>(strobe2.position)};
}

Randstrobe RandstrobeGenerator::next() {
Expand Down Expand Up @@ -198,7 +207,10 @@ Randstrobe RandstrobeGenerator::next() {
}
}
syncmers.pop_front();
return Randstrobe{randstrobe_hash(strobe1.hash, strobe2.hash), static_cast<uint32_t>(strobe1.position), static_cast<uint32_t>(strobe2.position)};
// return Randstrobe{randstrobe_hash(strobe1.hash, strobe2.hash), static_cast<uint32_t>(strobe1.position), static_cast<uint32_t>(strobe2.position)};
return Randstrobe{digest_hash(strobe1.hash, strobe2.hash, digest),
static_cast<uint32_t>(strobe1.position),
static_cast<uint32_t>(strobe2.position)};
}

/*
Expand Down
Loading

0 comments on commit bf8abf0

Please sign in to comment.