Skip to content

Commit

Permalink
Merge branch 'mapq' into 'dev'
Browse files Browse the repository at this point in the history
Plumb in mapQ [CW-1588]

See merge request epi2melabs/modbam2bed!25
  • Loading branch information
cjw85 committed Feb 15, 2023
2 parents bf7f069 + d195807 commit a3db13b
Show file tree
Hide file tree
Showing 12 changed files with 45 additions and 19 deletions.
1 change: 1 addition & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ bld:api-test:
- *prep-image
- *minimal-python
- make test_api
- make test_python
deploy-checks:
Expand Down
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [v0.9.1]
### Added
- `--map_q` command line option to filter reads by minimum mapping quality.
### Fixed
- The default mapping quality was erroneously 1 not 0.

## [v0.9.0]
### Added
- "Other" modified base column to extended output. For example, when using
Expand Down
2 changes: 1 addition & 1 deletion build.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
mplp_data *create_bam_iter_data(
const bam_fset* fset, const char *chr, int start, int end,
const char *read_group, const char tag_name[2], const int tag_value);
const char *read_group, const char tag_name[2], const int tag_value, const int min_mapq);
void destroy_bam_iter_data(mplp_data *data);
// iterate a file
int read_bam(void *data, bam1_t *b);
Expand Down
24 changes: 15 additions & 9 deletions modbampy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import libmodbampy

# remember to bump version in src/version.h too
__version__ = "0.9.0"
__version__ = "0.9.1"
ffi = libmodbampy.ffi
libbam = libmodbampy.lib

Expand Down Expand Up @@ -49,16 +49,17 @@ class ModBase:
"""

def __init__(self, code, base=None, name="unknown", abbrev="unknown"):
"""Initialise the instance."""
self._name = ffi.new("char[]", name.encode())
self._abbrev= ffi.new("char[]", abbrev.encode())
self._abbrev = ffi.new("char[]", abbrev.encode())
self._base = base
err = TypeError(
"'base' should be a single character or None")
if isinstance(self._base, str):
if len(self._base) != 1:
raise err
self._base = base.encode()
self._base_i = {"A":1, "C":2, "G":4, "T":8}[base]
self._base_i = {"A": 1, "C": 2, "G": 4, "T": 8}[base]
elif self._base is not None:
raise err

Expand All @@ -76,6 +77,7 @@ def __init__(self, code, base=None, name="unknown", abbrev="unknown"):

@property
def struct(self):
"""Return a list compatible with C structure."""
for i in range(libbam.n_mod_bases):
if libbam.mod_bases[i].code == self._code:
return libbam.mod_bases[i]
Expand Down Expand Up @@ -114,7 +116,7 @@ def __exit__(self, type, value, traceback):

def reads(
self, chrom, start, end,
read_group=None, tag_name=None, tag_value=None):
read_group=None, tag_name=None, tag_value=None, min_mapq=0):
"""Iterate over (filtered) alignments in file.
:param chrom: reference sequence from BAM.
Expand All @@ -123,14 +125,15 @@ def reads(
:param read group: read group of read to return.
:param tag_name: read tag to check during read filtering.
:param tag_value: tag value for reads to keep.
:param min_mapq: minimum read mapping quality.
"""
read_group, tag_name, tag_value = _tidy_args(
read_group, tag_name, tag_value)

data = ffi.gc(
libbam.create_bam_iter_data(
self._bam_fset, chrom.encode(), start, end,
read_group, tag_name, tag_value),
read_group, tag_name, tag_value, min_mapq),
libbam.destroy_bam_iter_data)
mod_state = ffi.gc(
libbam.hts_base_mod_state_alloc(),
Expand All @@ -144,7 +147,8 @@ def pileup(
self, chrom, start, end,
read_group=None, tag_name=None, tag_value=None,
low_threshold=0.33, high_threshold=0.66, threshold=0.66,
mod_base="m", max_depth=None, canon_base=None, combine=False):
mod_base="m", max_depth=None, canon_base=None, combine=False,
min_mapq=0):
"""Create a base count matrix.
:param chrom: reference sequence from BAM.
Expand All @@ -153,14 +157,16 @@ def pileup(
:param read group: read group of read to return.
:param tag_name: read tag to check during read filtering.
:param tag_value: tag value for reads to keep.
:param threshold: probability filter threshold for excluding calls from counts.
:param threshold: probability filter threshold for excluding
calls from counts.
:param mod_base: ChEBI code of modified base to examine.
:param max_depth: maximum read depth to examine.
:param canon_base: canonical base corresponding to `mod_base`.
Required only if `mod_base is not a modification known to
Required only if `mod_base` is not a modification known to
the code.
:param combine: combine (include) all alternative modifications
with the same parent canonical base.
:param min_mapq: minimum read mapping quality.
"""
for thresh in (low_threshold, high_threshold):
if thresh < 0.0 or thresh > 1.0:
Expand All @@ -181,7 +187,7 @@ def pileup(
fsets, chrom.encode(), start, end,
read_group, tag_name, tag_value,
threshold, mod_base.struct,
combine, max_depth)
combine, max_depth, min_mapq)
# TODO: check for NULL

# copy data to numpy, we could be more clever here an wrap
Expand Down
6 changes: 6 additions & 0 deletions src/args.c
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ static struct argp_option options[] = {
"Only process reads with a given tag value.", 3},
{"haplotype", 0x300, "VAL", 0,
"Only process reads from a given haplotype. Equivalent to --tag_name HP --tag_value VAL.", 3},
{"map_q", 0x900, "QUAL", 0,
"Filter reads below this mapping quality.", 3},
{ 0 }
};

Expand Down Expand Up @@ -169,6 +171,9 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state) {
tag_items += 2;
hp_given = true;
break;
case 0x900:
arguments->min_mapQ = atoi(arg);
break;
case 't':
arguments->threads = atoi(arg);
break;
Expand Down Expand Up @@ -232,6 +237,7 @@ arguments_t parse_arguments(int argc, char** argv) {
args.prefix = "mod-counts";
args.pileup = false;
args.hts_maxcnt = INT_MAX;
args.min_mapQ = 0;
argp_parse(&argp, argc, argv, 0, 0, &args);
// allow CpG only for C!
if (args.cpg || args.chh || args.chg) {
Expand Down
1 change: 1 addition & 0 deletions src/args.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ typedef struct arguments {
char* prefix;
bool pileup;
int hts_maxcnt;
int min_mapQ;
} arguments_t;

arguments_t parse_arguments(int argc, char** argv);
Expand Down
5 changes: 3 additions & 2 deletions src/bamiter.c
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,8 @@ void destroy_filesets(set_fsets *s) {
*/
mplp_data *create_bam_iter_data(
const bam_fset* bam_set, const char *chr, int start, int end,
const char *read_group, const char tag_name[2], const int tag_value) {
const char *read_group, const char tag_name[2], const int tag_value,
const int min_mapQ) {

// open bam etc.
// this is all now deferred to the caller
Expand All @@ -96,7 +97,7 @@ mplp_data *create_bam_iter_data(
data->fp = fp; data->idx = idx; data->hdr = hdr;
data->iter = bam_itr_queryi(idx, mytid, start, end);
memcpy(data->tag_name, tag_name, 2); data->tag_value = tag_value;
data->min_mapQ = 1; data->read_group = read_group;
data->min_mapQ = min_mapQ; data->read_group = read_group;

return data;
}
Expand Down
4 changes: 3 additions & 1 deletion src/bamiter.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,15 @@ void destroy_filesets(set_fsets *s);
* @param read_group by which to filter alignments.
* @param tag_name by which to filter alignments.
* @param tag_value associated with tag_name.
* @param min_mapQ minimum mapping quality of reads.
*
* The return value can be freed with destroy_bam_iter_data.
*
*/
mplp_data *create_bam_iter_data(
const bam_fset* fset, const char *chr, int start, int end,
const char *read_group, const char tag_name[2], const int tag_value);
const char *read_group, const char tag_name[2], const int tag_value,
const int min_mapQ);

/** Clean up auxiliary bam reading data.
*
Expand Down
6 changes: 4 additions & 2 deletions src/counts.c
Original file line number Diff line number Diff line change
Expand Up @@ -458,6 +458,8 @@ bool query_mod_subtag(hts_base_mod_state *state, int qtype, int qcanonical, char
* @param threshold probability filter for excluding calls from counts.
* @param mb BAM code for modified base to report. (e.g. h for 5hmC), or a ChEBI code.
* @param combine combine all modified bases corresponding to same canonical base as mb
* @param max_depth maximum depth of pileup.
* @param min_mapQ minimum mapping quality of reads.
* @returns a pileup data pointer.
*
* The return value can be freed with destroy_plp_data.
Expand All @@ -466,7 +468,7 @@ bool query_mod_subtag(hts_base_mod_state *state, int qtype, int qcanonical, char
plp_data calculate_pileup(
const set_fsets *fsets, const char *chr, int start, int end,
const char *read_group, const char tag_name[2], const int tag_value,
int threshold, mod_base mb, bool combine, int max_depth) {
int threshold, mod_base mb, bool combine, int max_depth, int min_mapQ) {

static bool shown_second_strand_warning = false;

Expand All @@ -480,7 +482,7 @@ plp_data calculate_pileup(
mplp_data **data = xalloc(fsets->n, sizeof(mplp_data*), "bam files");
for (size_t i = 0; i < nfile; ++i) {
data[i] = create_bam_iter_data(
fsets->fsets[i], chr, start, end, read_group, tag_name, tag_value);
fsets->fsets[i], chr, start, end, read_group, tag_name, tag_value, min_mapQ);
if (data[i] == NULL) {
// TODO: clean-up all j<i data[i], and free data
return NULL;
Expand Down
3 changes: 2 additions & 1 deletion src/counts.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ void print_bedmethyl(
* @param mod_base a mod_base instance
* @param combine combine all modified bases corresponding to same canonical base as mb
* @param max_depth maximum depth of pileup.
* @param min_mapQ
* @returns a pileup data pointer.
*
* The return value can be freed with destroy_plp_data.
Expand All @@ -168,6 +169,6 @@ void print_bedmethyl(
plp_data calculate_pileup(
const set_fsets *fsets, const char *chr, int start, int end,
const char *read_group, const char tag_name[2], const int tag_value,
int threshold, mod_base mb, bool combine, int max_depth);
int threshold, mod_base mb, bool combine, int max_depth, int min_mapQ);

#endif
4 changes: 2 additions & 2 deletions src/modbam2bed.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ void *pileup_worker(void *arg) {
files, j.chr, j.start, j.end,
j.args.read_group, j.args.tag_name, j.args.tag_value,
j.args.threshold, j.args.mod_base, j.args.combine,
j.args.hts_maxcnt);
j.args.hts_maxcnt, j.args.min_mapQ);
destroy_filesets(files);
free(arg);
return pileup;
Expand All @@ -58,7 +58,7 @@ void process_region(arguments_t args, const char *chr, int start, int end, char
args.bam, chr, start, end,
args.read_group, args.tag_name, args.tag_value,
args.threshold, args.mod_base, args.combine,
args.hts_maxcnt);
args.hts_maxcnt, args.min_mapQ);
if (pileup == NULL) return;

init_output_buffers(bed_files);
Expand Down
2 changes: 1 addition & 1 deletion src/version.h
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
// remember to bump version in modbampy/__init__.py too
const char *argp_program_version = "0.9.0";
const char *argp_program_version = "0.9.1";

0 comments on commit a3db13b

Please sign in to comment.