Skip to content

Commit

Permalink
handle lower-case nucleotides
Browse files Browse the repository at this point in the history
  • Loading branch information
jermp committed Sep 23, 2023
1 parent 220b2c0 commit b278b8e
Show file tree
Hide file tree
Showing 10 changed files with 163 additions and 108 deletions.
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,6 @@ target_link_libraries(sshash
sshash_static
z
)

# tests
add_executable(test_alphabet test/test_alphabet.cpp)
4 changes: 2 additions & 2 deletions include/buckets.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ struct buckets {
uint64_t offset = id_to_offset(kmer_id, k);
bit_vector_iterator bv_it(strings, 2 * offset);
kmer_t read_kmer = bv_it.read(2 * k);
util::uint_kmer_to_string_no_reverse(read_kmer, string_kmer, k);
util::uint_kmer_to_string(read_kmer, string_kmer, k);
}

struct iterator {
Expand Down Expand Up @@ -196,7 +196,7 @@ struct buckets {
while (offset != next_offset - m_k + 1) {
ret.first = m_kmer_id;
if (clear) {
util::uint_kmer_to_string_no_reverse(read_kmer, ret.second.data(), m_k);
util::uint_kmer_to_string(read_kmer, ret.second.data(), m_k);
} else {
memmove(ret.second.data(), ret.second.data() + 1, m_k - 1);
ret.second[m_k - 1] = util::uint64_to_char(last_two_bits);
Expand Down
2 changes: 1 addition & 1 deletion include/builder/parse_file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
while (end != sequence.size() - k + 1) {
char const* kmer = sequence.data() + end;
assert(util::is_valid(kmer, k));
kmer_t uint_kmer = util::string_to_uint_kmer_no_reverse(kmer, k);
kmer_t uint_kmer = util::string_to_uint_kmer(kmer, k);
uint64_t minimizer = util::compute_minimizer(uint_kmer, k, m, seed);

if (build_config.canonical_parsing) {
Expand Down
10 changes: 5 additions & 5 deletions include/dictionary.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ lookup_result dictionary::lookup_uint_canonical_parsing(kmer_t uint_kmer) const
}

uint64_t dictionary::lookup(char const* string_kmer, bool check_reverse_complement) const {
kmer_t uint_kmer = util::string_to_uint_kmer_no_reverse(string_kmer, m_k);
kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k);
return lookup_uint(uint_kmer, check_reverse_complement);
}
uint64_t dictionary::lookup_uint(kmer_t uint_kmer, bool check_reverse_complement) const {
Expand All @@ -66,7 +66,7 @@ uint64_t dictionary::lookup_uint(kmer_t uint_kmer, bool check_reverse_complement

lookup_result dictionary::lookup_advanced(char const* string_kmer,
bool check_reverse_complement) const {
kmer_t uint_kmer = util::string_to_uint_kmer_no_reverse(string_kmer, m_k);
kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k);
return lookup_advanced_uint(uint_kmer, check_reverse_complement);
}
lookup_result dictionary::lookup_advanced_uint(kmer_t uint_kmer,
Expand Down Expand Up @@ -120,7 +120,7 @@ void dictionary::backward_neighbours(kmer_t prefix, neighbourhood& res) const {
}

neighbourhood dictionary::kmer_forward_neighbours(char const* string_kmer) const {
kmer_t uint_kmer = util::string_to_uint_kmer_no_reverse(string_kmer, m_k);
kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k);
return kmer_forward_neighbours(uint_kmer);
}
neighbourhood dictionary::kmer_forward_neighbours(kmer_t uint_kmer) const {
Expand All @@ -131,7 +131,7 @@ neighbourhood dictionary::kmer_forward_neighbours(kmer_t uint_kmer) const {
}

neighbourhood dictionary::kmer_backward_neighbours(char const* string_kmer) const {
kmer_t uint_kmer = util::string_to_uint_kmer_no_reverse(string_kmer, m_k);
kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k);
return kmer_backward_neighbours(uint_kmer);
}
neighbourhood dictionary::kmer_backward_neighbours(kmer_t uint_kmer) const {
Expand All @@ -142,7 +142,7 @@ neighbourhood dictionary::kmer_backward_neighbours(kmer_t uint_kmer) const {
}

neighbourhood dictionary::kmer_neighbours(char const* string_kmer) const {
kmer_t uint_kmer = util::string_to_uint_kmer_no_reverse(string_kmer, m_k);
kmer_t uint_kmer = util::string_to_uint_kmer(string_kmer, m_k);
return kmer_neighbours(uint_kmer);
}
neighbourhood dictionary::kmer_neighbours(kmer_t uint_kmer) const {
Expand Down
5 changes: 2 additions & 3 deletions include/dump.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,8 @@ void dictionary::dump(std::string const& filename) const {
[minimizer_id]:[super_kmer_id]:[minimizer_string]:[position_of_minimizer_in_super_kmer]
*/
out << '>' << bucket_id << ':' << super_kmer_id - begin << ':'
<< util::uint_kmer_to_string_no_reverse(minimizer, m_m) << ':' << pos
<< '\n';
out << util::uint_kmer_to_string_no_reverse(kmer, m_k);
<< util::uint_kmer_to_string(minimizer, m_m) << ':' << pos << '\n';
out << util::uint_kmer_to_string(kmer, m_k);
super_kmer_header_written = true;
} else {
if (minimizer != prev_minimizer) {
Expand Down
4 changes: 2 additions & 2 deletions include/query/streaming_query_canonical_parsing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,9 @@ struct streaming_query_canonical_parsing {
if (!m_start) {
m_kmer >>= 2;
m_kmer += (util::char_to_uint(kmer[m_k - 1])) << m_shift;
assert(m_kmer == util::string_to_uint_kmer_no_reverse(kmer, m_k));
assert(m_kmer == util::string_to_uint_kmer(kmer, m_k));
} else {
m_kmer = util::string_to_uint_kmer_no_reverse(kmer, m_k);
m_kmer = util::string_to_uint_kmer(kmer, m_k);
}
m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start);
assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed));
Expand Down
4 changes: 2 additions & 2 deletions include/query/streaming_query_regular_parsing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ struct streaming_query_regular_parsing {
if (!m_start) {
m_kmer >>= 2;
m_kmer += (util::char_to_uint(kmer[m_k - 1])) << m_shift;
assert(m_kmer == util::string_to_uint_kmer_no_reverse(kmer, m_k));
assert(m_kmer == util::string_to_uint_kmer(kmer, m_k));
} else {
m_kmer = util::string_to_uint_kmer_no_reverse(kmer, m_k);
m_kmer = util::string_to_uint_kmer(kmer, m_k);
}
m_curr_minimizer = m_minimizer_enum.next(m_kmer, m_start);
assert(m_curr_minimizer == util::compute_minimizer(m_kmer, m_k, m_m, m_seed));
Expand Down
139 changes: 50 additions & 89 deletions include/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,11 @@ char decimal binary
C 67 01000-01-1 -> 01
G 71 01000-11-1 -> 11
T 84 01010-10-0 -> 10
a 97 01100-00-1 -> 00
c 99 01100-01-1 -> 01
g 103 01100-11-1 -> 11
t 116 01110-10-0 -> 10
*/
static kmer_t char_to_uint(char c) { return (c >> 1) & 3; }

Expand All @@ -142,89 +147,26 @@ static char uint64_to_char(uint64_t x) {
return nucleotides[x];
}

/*
Traditional mapping.
*/
// uint64_t char_to_uint(char c) {
// switch (c) {
// case 'A':
// return 0;
// case 'C':
// return 1;
// case 'G':
// return 2;
// case 'T':
// return 3;
// }
// assert(false);
// return -1;
// }
// char uint64_to_char(uint64_t x) {
// switch (x) {
// case 0:
// return 'A';
// case 1:
// return 'C';
// case 2:
// return 'G';
// case 3:
// return 'T';
// }
// assert(false);
// return 0;
// }

/****************************************************************************
The following two functions preserves the lexicographic order of k-mers,
that is: if g and t are two k-mers and g < t lexicographically,
then also id(g) < id(t).
*/
[[maybe_unused]] static kmer_t string_to_uint_kmer(char const* str, uint64_t k) {
assert(k <= constants::max_k);
kmer_t x = 0;
for (uint64_t i = 0; i != k; ++i) {
x <<= 2;
x += char_to_uint(str[i]);
}
return x;
}
[[maybe_unused]] static void uint_kmer_to_string(kmer_t x, char* str, uint64_t k) {
assert(k <= constants::max_k);
for (int i = k - 1; i >= 0; --i) {
str[i] = uint64_to_char(x & 3);
x >>= 2;
}
}
/****************************************************************************/

[[maybe_unused]] static std::string uint_kmer_to_string(kmer_t x, uint64_t k) {
assert(k <= constants::max_k);
std::string str;
str.resize(k);
uint_kmer_to_string(x, str.data(), k);
return str;
}

[[maybe_unused]] static kmer_t string_to_uint_kmer_no_reverse(char const* str, uint64_t k) {
assert(k <= constants::max_k);
kmer_t x = 0;
for (uint64_t i = 0; i != k; ++i) x += char_to_uint(str[i]) << (2 * i);
return x;
}

static void uint_kmer_to_string_no_reverse(kmer_t x, char* str, uint64_t k) {
static void uint_kmer_to_string(kmer_t x, char* str, uint64_t k) {
assert(k <= constants::max_k);
for (uint64_t i = 0; i != k; ++i) {
str[i] = uint64_to_char(x & 3);
x >>= 2;
}
}

[[maybe_unused]] static std::string uint_kmer_to_string_no_reverse(kmer_t x, uint64_t k) {
[[maybe_unused]] static std::string uint_kmer_to_string(kmer_t x, uint64_t k) {
assert(k <= constants::max_k);
std::string str;
str.resize(k);
uint_kmer_to_string_no_reverse(x, str.data(), k);
uint_kmer_to_string(x, str.data(), k);
return str;
}

Expand Down Expand Up @@ -282,31 +224,51 @@ template <bool align>
}
}

// forward character map. A -> A, C -> C, G -> G, T -> T. rest maps to zero.
/*
Forward character map:
A -> A 65
C -> C 67
G -> G 71
T -> T 84
a -> a 97
c -> c 99
g -> g 103
t -> t 116
All other chars map to zero.
*/
static const char canonicalize_basepair_forward_map[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 65, 0, 67, 0, 0, 0, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 65, 0, 67, 0, 0, 0, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 97, 0, 99, 0, 0, 0, 103, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
116, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

// reverse character map. A -> T, C -> G, G -> C, T -> A. rest maps to zero.
/*
Reverse character map:
65 A -> T 84
67 C -> G 71
71 G -> C 67
84 T -> A 65
97 a -> t 116
99 c -> g 103
103 g -> c 99
116 t -> a 97
All other chars map to zero.
*/
static const char canonicalize_basepair_reverse_map[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 84, 0, 71, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 84, 0, 71, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 65, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 116, 0, 103, 0, 0, 0, 99, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
97, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

[[maybe_unused]] static void compute_reverse_complement(char const* input, char* output,
uint64_t size) {
Expand All @@ -322,7 +284,6 @@ static inline bool is_valid(int c) { return canonicalize_basepair_forward_map[c]
for (uint64_t i = 0; i != size; ++i) {
int c = str[i];
if (canonicalize_basepair_forward_map[c] == 0) return false;
// if (c != 'A' and c != 'C' and c != 'G' and c != 'T') return false;
}
return true;
}
Expand Down
22 changes: 18 additions & 4 deletions src/check_utils.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
#pragma once

#include <algorithm> // for std::transform

#include "../include/gz/zip_stream.hpp"

namespace sshash {
Expand All @@ -11,6 +13,7 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) {
std::string line;
uint64_t pos = 0;
uint64_t num_kmers = 0;
uint64_t num_lines = 0;
lookup_result prev;
prev.contig_id = 0;

Expand All @@ -25,22 +28,33 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) {
line.clear();
continue;
}

/* transform 50% of the read nucleotides into lower-case letters
(assuming the input is upper-case):
lower-case kmers must be found anyway in the index */
if ((num_lines & 1) == 0) {
std::transform(line.begin(), line.end(), line.begin(),
[](char c) { return std::tolower(c); });
}
++num_lines;

for (uint64_t i = 0; i + k <= line.size(); ++i) {
assert(util::is_valid(line.data() + i, k));
kmer_t uint_kmer = util::string_to_uint_kmer_no_reverse(line.data() + i, k);

kmer_t uint_kmer = util::string_to_uint_kmer(line.data() + i, k);
bool orientation = constants::forward_orientation;

if (num_kmers != 0 and num_kmers % 5000000 == 0) {
std::cout << "checked " << num_kmers << " kmers" << std::endl;
}

/* transform 50% of the kmers into their reverse complements */
if ((num_kmers & 1) == 0) {
/* transform 50% of the kmers into their reverse complements */
uint_kmer = util::compute_reverse_complement(uint_kmer, k);
orientation = constants::backward_orientation;
}

util::uint_kmer_to_string_no_reverse(uint_kmer, expected_kmer_str.data(), k);
util::uint_kmer_to_string(uint_kmer, expected_kmer_str.data(), k);
uint64_t id = dict.lookup(expected_kmer_str.c_str());

/*
Expand Down Expand Up @@ -124,7 +138,7 @@ bool check_correctness_lookup_access(std::istream& is, dictionary const& dict) {

// check access
dict.access(id, got_kmer_str.data());
kmer_t got_uint_kmer = util::string_to_uint_kmer_no_reverse(got_kmer_str.data(), k);
kmer_t got_uint_kmer = util::string_to_uint_kmer(got_kmer_str.data(), k);
kmer_t got_uint_kmer_rc = util::compute_reverse_complement(got_uint_kmer, k);
if (got_uint_kmer != uint_kmer and got_uint_kmer_rc != uint_kmer) {
std::cout << "ERROR: got '" << got_kmer_str << "' but expected '"
Expand Down
Loading

0 comments on commit b278b8e

Please sign in to comment.