Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add VCF/BCF support for POS=0 coordinate #1573

Open
wants to merge 7 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 101 additions & 37 deletions hts.c
Original file line number Diff line number Diff line change
Expand Up @@ -2900,21 +2900,55 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx)
*** Iterator ***
****************/

static inline int grow_itr_bins(hts_itr_t *itr, int num)
{
if (itr->bins.m - itr->bins.n < num) {
size_t new_n = (size_t) itr->bins.n + num;
size_t new_m = (size_t) itr->bins.m + (itr->bins.m >> 1);
if (new_m < new_n)
new_m = new_n;
if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) {
errno = ENOMEM;
return -1;
}
int *new_a = realloc(itr->bins.a, sizeof(int) * new_m);
if (!new_a)
return -1;
itr->bins.m = (int) new_m;
itr->bins.a = new_a;
}
return 0;
}

// Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<<s.
static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls)
{
int l, t, s = min_shift + (n_lvls<<1) + n_lvls;
if (beg >= end) return 0;
if (beg < 0 ) {
// Deal with historic difference of opinion about which bin the
// range [-1, 0) should end up in. HTSlib since 2016 says the
// smallest position 0 bin; before that it would crash when trying
// to make the index. htsjdk and the SAM specification say
// they should go into the bin before this instead, which is the
// largest one on the previous level - and all other ranges
// starting -1 will go into bin 0 (the SAM spec. lists other
// possibilities, but they can't actually occur). To ensure
// compatibility, we need to add this extra bin to the search list
// as a special case.
beg = 0;
if (end == 0) end = 1;
if (grow_itr_bins(itr, 1) < 0)
return -1;
itr->bins.a[itr->bins.n++] = hts_bin_first(n_lvls) - 1;
}
if (end >= 1LL<<s) end = 1LL<<s;
for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) {
hts_pos_t b, e;
int n, i;
b = t + (beg>>s); e = t + (end>>s); n = e - b + 1;
if (itr->bins.n + n > itr->bins.m) {
itr->bins.m = itr->bins.n + n;
kroundup32(itr->bins.m);
itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m);
}
if (grow_itr_bins(itr, n) < 0)
return -1;
for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i;
}
return itr->bins.n;
Expand Down Expand Up @@ -3086,7 +3120,9 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
} else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) {
iter->finished = 1;
} else {
if (beg < 0) beg = 0;
if (beg < -1) beg = -1;
if (end >= 1LL << (idx->min_shift + 3 * idx->n_lvls))
end = (1LL << (idx->min_shift + 3 * idx->n_lvls)) - 1;
if (end < beg) {
free(iter);
return NULL;
Expand All @@ -3103,7 +3139,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t

if ( !kh_size(bidx) ) { iter->finished = 1; return iter; }

rel_off = beg>>idx->min_shift;
rel_off = (beg>=0 ? beg : 0) >> idx->min_shift;
// compute min_off
bin = hts_bin_first(idx->n_lvls) + rel_off;
do {
Expand All @@ -3116,6 +3152,15 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
} while (bin);
if (bin == 0) k = kh_get(bin, bidx, bin);
min_off = k != kh_end(bidx)? kh_val(bidx, k).loff : 0;

if (beg < 0 && min_off > 0) {
// Regions [-1, 0) may be in hts_bin_first(idx->n_lvls) - 1
// which could give a lower min_off
k = kh_get(bin, bidx, hts_bin_first(idx->n_lvls) - 1);
if (k != kh_end(bidx) && kh_val(bidx, k).loff < min_off)
min_off = kh_val(bidx, k).loff;
}

// min_off can be calculated more accurately if the
// linear index is available
if (idx->lidx[tid].offset
Expand All @@ -3138,8 +3183,6 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
min_off = kh_val(bidx, k).loff;
}
} else if (unmapped) { //CSI index
if (k != kh_end(bidx))
min_off = kh_val(bidx, k).loff;
}

// compute max_off: a virtual offset from a bin to the right of end
Expand All @@ -3157,7 +3200,10 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t
}

// retrieve bins
reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls);
if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls) < 0) {
hts_itr_destroy(iter);
return NULL;
}

for (i = n_off = 0; i < iter->bins.n; ++i)
if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx))
Expand Down Expand Up @@ -3615,6 +3661,7 @@ const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,

const char *colon = NULL, *comma = NULL;
int quoted = 0;
hts_pos_t first_base = flags & HTS_PARSE_REF_START_MINUS_1 ? -1 : 0;

if (flags & HTS_PARSE_LIST)
flags &= ~HTS_PARSE_THOUSANDS_SEP;
Expand Down Expand Up @@ -3660,7 +3707,7 @@ const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,

// No colon is simplest case; just check and return.
if (colon == NULL) {
*beg = 0; *end = HTS_POS_MAX;
*beg = first_base; *end = HTS_POS_MAX;
kputsn(s, s_len-quoted, &ks); // convert to nul terminated string
if (!ks.s) {
*tid = -2;
Expand All @@ -3675,7 +3722,7 @@ const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,

// Has a colon, but check whole name first.
if (!quoted) {
*beg = 0; *end = HTS_POS_MAX;
*beg = first_base; *end = HTS_POS_MAX;
kputsn(s, s_len, &ks); // convert to nul terminated string
if (!ks.s) {
*tid = -2;
Expand Down Expand Up @@ -3722,39 +3769,46 @@ const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg,

// Finally parse the post-colon coordinates
char *hyphen;
*beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
if (*beg < 0) {
if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') {
// User specified zero, but we're 1-based.
hts_log_error("Coordinates must be > 0");
return NULL;
}
if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') {
// interpret chr:-100 as chr:1-100
*end = *beg==-1 ? HTS_POS_MAX : -(*beg+1);
*beg = 0;
return s_end;
} else if (*beg < -1) {
hts_log_error("Unexpected string \"%s\" after region", hyphen);
return NULL;
const char *from_pos = colon + 1;
int have_start_pos = 0;
while (isspace_c(*from_pos)) from_pos++;
if (*from_pos == '-') {
// interpret chr:-100 as chr:1-100
*beg = first_base;
hyphen = (char *) from_pos;
} else {
*beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1;
if (hyphen != colon+1) {
have_start_pos = 1;
} else {
*beg = first_base;
}
}

if (*hyphen == '\0' || ((flags & HTS_PARSE_LIST) && *hyphen == ',')) {
*end = flags & HTS_PARSE_ONE_COORD ? *beg+1 : HTS_POS_MAX;
} else if (*hyphen == '-') {
*end = hts_parse_decimal(hyphen+1, &hyphen, flags);
if (*hyphen != '\0' && *hyphen != ',') {
hts_log_error("Unexpected string \"%s\" after region", hyphen);
return NULL;
}
} else {
return s_end;
} else if (*hyphen != '-') {
hts_log_error("Unexpected string \"%s\" after region", hyphen);
return NULL;
}

if (*end == 0)
*end = HTS_POS_MAX; // interpret chr:100- as chr:100-<end>
char *last;
*end = hts_parse_decimal(hyphen+1, &last, flags);
if (!have_start_pos &&
(*last == '-' || *end < 0 || (flags & HTS_PARSE_ONE_COORD))) {
// Attempt to use a negative position, e.g. chr:-100-101
// or chr:--100
hts_log_error("Coordinates must be > 0");
return NULL;
} else if (last == hyphen + 1) {
if (*last != '\0' && !((flags & HTS_PARSE_LIST) && *last == ',')) {
hts_log_error("Unexpected string \"%s\" after region", last);
return NULL;
} else {
*end = HTS_POS_MAX; // interpret chr:100- as chr:100-<end>
}
}

if (*beg >= *end) return NULL;

Expand Down Expand Up @@ -3806,15 +3860,25 @@ const char *hts_parse_reg(const char *s, int *beg, int *end)

hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f getid, void *hdr, hts_itr_query_func *itr_query, hts_readrec_func *readrec)
{
extern int bcf_readrec(BGZF *fp, void *null, void *v, int *tid, hts_pos_t *beg, hts_pos_t *end);
int tid;
hts_pos_t beg, end;
int flags = HTS_PARSE_THOUSANDS_SEP;

if (strcmp(reg, ".") == 0)
return itr_query(idx, HTS_IDX_START, 0, 0, readrec);
else if (strcmp(reg, "*") == 0)
return itr_query(idx, HTS_IDX_NOCOOR, 0, 0, readrec);

if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, HTS_PARSE_THOUSANDS_SEP))
if (readrec == bcf_readrec) {
flags |= HTS_PARSE_REF_START_MINUS_1;
} else if (readrec == tbx_readrec) {
tbx_t *tbx = (tbx_t *) hdr;
if ((tbx->conf.preset&0xffff) == TBX_VCF)
flags |= HTS_PARSE_REF_START_MINUS_1;
}

if (!hts_parse_region(reg, &tid, &beg, &end, getid, hdr, flags))
return NULL;

return itr_query(idx, tid, beg, end, readrec);
Expand Down
1 change: 1 addition & 0 deletions htslib/hts.h
Original file line number Diff line number Diff line change
Expand Up @@ -1132,6 +1132,7 @@ int hts_idx_nseq(const hts_idx_t *idx);
#define HTS_PARSE_THOUSANDS_SEP 1 ///< Ignore ',' separators within numbers
#define HTS_PARSE_ONE_COORD 2 ///< chr:pos means chr:pos-pos and not chr:pos-end
#define HTS_PARSE_LIST 4 ///< Expect a comma separated list of regions. (Disables HTS_PARSE_THOUSANDS_SEP)
#define HTS_PARSE_REF_START_MINUS_1 8 ///< Return -1 for the start of a complete reference (mainly for VCF)

/// Parse a numeric string
/** The number may be expressed in scientific notation, and optionally may
Expand Down
2 changes: 2 additions & 0 deletions htslib/synced_bcf_reader.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ DEALINGS IN THE SOFTWARE. */
extern "C" {
#endif

#define CSI_COOR_EMPTY -2

/*
When reading multiple files in parallel, duplicate records within each
file will be reordered and offered in intuitive order. For example,
Expand Down
48 changes: 29 additions & 19 deletions synced_bcf_reader.c
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ int bcf_sr_add_reader(bcf_srs_t *files, const char *fname)
if ( !files->regions )
files->regions = _regions_init_string(names[i]);
else
_regions_add(files->regions, names[i], -1, -1);
_regions_add(files->regions, names[i], CSI_COOR_EMPTY, CSI_COOR_EMPTY);
}
free(names);
_regions_sort_and_merge(files->regions);
Expand Down Expand Up @@ -555,7 +555,7 @@ static int _readers_next_region(bcf_srs_t *files)
int prev_iseq = files->regions->iseq;
hts_pos_t prev_end = files->regions->end;
if ( bcf_sr_regions_next(files->regions)<0 ) return -1;
files->regions->prev_end = prev_iseq==files->regions->iseq ? prev_end : -1;
files->regions->prev_end = prev_iseq==files->regions->iseq ? prev_end : CSI_COOR_EMPTY;

for (i=0; i<files->nreaders; i++)
_reader_seek(&files->readers[i],files->regions->seq_names[files->regions->iseq],files->regions->start,files->regions->end);
Expand Down Expand Up @@ -616,7 +616,7 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
{
reader->buffer[reader->mbuffer-i] = bcf_init1();
reader->buffer[reader->mbuffer-i]->max_unpack = files->max_unpack;
reader->buffer[reader->mbuffer-i]->pos = -1; // for rare cases when VCF starts from 1
reader->buffer[reader->mbuffer-i]->pos = CSI_COOR_EMPTY; // for rare cases when VCF starts from 1 or 0
}
}
if ( files->streaming )
Expand Down Expand Up @@ -656,7 +656,6 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
if ( ret < 0 ) break; // no more lines or an error
bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]);
}

// Prevent creation of duplicates from records overlapping multiple regions
// and recognize true variant overlaps vs record overlaps (e.g. TA>T vs A>-)
if ( files->regions )
Expand All @@ -676,7 +675,6 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader)
hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap);
exit(1);
}

if ( beg <= files->regions->prev_end || end < files->regions->start || beg > files->regions->end ) continue;
}

Expand Down Expand Up @@ -954,9 +952,9 @@ int bcf_sr_set_samples(bcf_srs_t *files, const char *fname, int is_file)
// inclusive. Sorting and merging step needed afterwards: qsort(..,cmp_regions) and merge_regions().
static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, hts_pos_t end)
{
if ( start==-1 && end==-1 )
if ( start==CSI_COOR_EMPTY && end==CSI_COOR_EMPTY )
{
start = 0; end = MAX_CSI_COOR-1;
start = -1; end = MAX_CSI_COOR-1;
}
else
{
Expand Down Expand Up @@ -1029,8 +1027,9 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg)
static bcf_sr_regions_t *_regions_init_string(const char *str)
{
bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
reg->start = reg->end = -1;
reg->prev_start = reg->prev_end = reg->prev_seq = -1;
reg->start = reg->end = CSI_COOR_EMPTY;
reg->prev_start = reg->prev_end = CSI_COOR_EMPTY;
reg->prev_seq = -1;

kstring_t tmp = {0,0,0};
const char *sp = str, *ep = str;
Expand All @@ -1043,11 +1042,20 @@ static bcf_sr_regions_t *_regions_init_string(const char *str)
if ( *ep==':' )
{
sp = ep+1;
from = hts_parse_decimal(sp,(char**)&ep,0);
if ( sp==ep )
while (isspace((uint8_t) *sp)) sp++;
if (*sp == '-')
{
hts_log_error("Could not parse the region(s): %s", str);
free(reg); free(tmp.s); return NULL;
ep = sp;
from = -1;
}
else
{
from = hts_parse_decimal(sp,(char**)&ep,0);
if ( sp==ep )
{
hts_log_error("Could not parse the region(s): %s", str);
free(reg); free(tmp.s); return NULL;
}
}
if ( !*ep || *ep==',' )
{
Expand Down Expand Up @@ -1075,7 +1083,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str)
}
else
{
if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1);
if ( tmp.l ) _regions_add(reg, tmp.s, CSI_COOR_EMPTY, CSI_COOR_EMPTY);
if ( !*ep ) break;
sp = ++ep;
}
Expand Down Expand Up @@ -1157,8 +1165,9 @@ bcf_sr_regions_t *bcf_sr_regions_init(const char *regions, int is_file, int ichr
}

reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t));
reg->start = reg->end = -1;
reg->prev_start = reg->prev_end = reg->prev_seq = -1;
reg->start = reg->end = CSI_COOR_EMPTY;
reg->prev_start = reg->prev_end = CSI_COOR_EMPTY;
reg->prev_seq = -1;

reg->file = hts_open(regions, "rb");
if ( !reg->file )
Expand Down Expand Up @@ -1253,7 +1262,8 @@ void bcf_sr_regions_destroy(bcf_sr_regions_t *reg)

int bcf_sr_regions_seek(bcf_sr_regions_t *reg, const char *seq)
{
reg->iseq = reg->start = reg->end = -1;
reg->iseq = -1;
reg->start = reg->end = CSI_COOR_EMPTY;
if ( khash_str2int_get(reg->seq_hash, seq, &reg->iseq) < 0 ) return -1; // sequence seq not in regions

// using in-memory regions
Expand Down Expand Up @@ -1284,7 +1294,7 @@ static int advance_creg(region_t *reg)
int bcf_sr_regions_next(bcf_sr_regions_t *reg)
{
if ( reg->iseq<0 ) return -1;
reg->start = reg->end = -1;
reg->start = reg->end = CSI_COOR_EMPTY;
reg->nals = 0;

// using in-memory regions
Expand Down Expand Up @@ -1442,7 +1452,7 @@ static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_p
bcf_sr_regions_flush(reg);

bcf_sr_regions_seek(reg, seq);
reg->start = reg->end = -1;
reg->start = reg->end = CSI_COOR_EMPTY;
}
if ( reg->prev_seq==iseq && reg->iseq!=iseq ) return -2; // no more regions on this chromosome
reg->prev_seq = reg->iseq;
Expand Down
Loading