diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 8edc91d..6d6be33 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -59,6 +59,7 @@ bld:api-test: - *prep-image - *minimal-python - make test_api + - make test_python deploy-checks: diff --git a/CHANGELOG.md b/CHANGELOG.md index 7244ba4..71b0c2e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,12 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [v0.9.1] +### Added +- `--map_q` command line option to filter reads by minimum mapping quality. +### Fixed +- The default mapping quality was erroneously 1 not 0. + ## [v0.9.0] ### Added - "Other" modified base column to extended output. For example, when using diff --git a/build.py b/build.py index aa1e2b4..68de6ce 100644 --- a/build.py +++ b/build.py @@ -82,7 +82,7 @@ mplp_data *create_bam_iter_data( const bam_fset* fset, const char *chr, int start, int end, - const char *read_group, const char tag_name[2], const int tag_value); + const char *read_group, const char tag_name[2], const int tag_value, const int min_mapq); void destroy_bam_iter_data(mplp_data *data); // iterate a file int read_bam(void *data, bam1_t *b); diff --git a/modbampy/__init__.py b/modbampy/__init__.py index c8eaaf9..1480de4 100644 --- a/modbampy/__init__.py +++ b/modbampy/__init__.py @@ -8,7 +8,7 @@ import libmodbampy # remember to bump version in src/version.h too -__version__ = "0.9.0" +__version__ = "0.9.1" ffi = libmodbampy.ffi libbam = libmodbampy.lib @@ -49,8 +49,9 @@ class ModBase: """ def __init__(self, code, base=None, name="unknown", abbrev="unknown"): + """Initialise the instance.""" self._name = ffi.new("char[]", name.encode()) - self._abbrev= ffi.new("char[]", abbrev.encode()) + self._abbrev = ffi.new("char[]", abbrev.encode()) self._base = base err = TypeError( "'base' should be a single character or None") @@ -58,7 +59,7 @@ def __init__(self, code, base=None, name="unknown", abbrev="unknown"): if len(self._base) != 1: raise err self._base = base.encode() - self._base_i = {"A":1, "C":2, "G":4, "T":8}[base] + self._base_i = {"A": 1, "C": 2, "G": 4, "T": 8}[base] elif self._base is not None: raise err @@ -76,6 +77,7 @@ def __init__(self, code, base=None, name="unknown", abbrev="unknown"): @property def struct(self): + """Return a list compatible with C structure.""" for i in range(libbam.n_mod_bases): if libbam.mod_bases[i].code == self._code: return libbam.mod_bases[i] @@ -114,7 +116,7 @@ def __exit__(self, type, value, traceback): def reads( self, chrom, start, end, - read_group=None, tag_name=None, tag_value=None): + read_group=None, tag_name=None, tag_value=None, min_mapq=0): """Iterate over (filtered) alignments in file. :param chrom: reference sequence from BAM. @@ -123,6 +125,7 @@ def reads( :param read group: read group of read to return. :param tag_name: read tag to check during read filtering. :param tag_value: tag value for reads to keep. + :param min_mapq: minimum read mapping quality. """ read_group, tag_name, tag_value = _tidy_args( read_group, tag_name, tag_value) @@ -130,7 +133,7 @@ def reads( data = ffi.gc( libbam.create_bam_iter_data( self._bam_fset, chrom.encode(), start, end, - read_group, tag_name, tag_value), + read_group, tag_name, tag_value, min_mapq), libbam.destroy_bam_iter_data) mod_state = ffi.gc( libbam.hts_base_mod_state_alloc(), @@ -144,7 +147,8 @@ def pileup( self, chrom, start, end, read_group=None, tag_name=None, tag_value=None, low_threshold=0.33, high_threshold=0.66, threshold=0.66, - mod_base="m", max_depth=None, canon_base=None, combine=False): + mod_base="m", max_depth=None, canon_base=None, combine=False, + min_mapq=0): """Create a base count matrix. :param chrom: reference sequence from BAM. @@ -153,14 +157,16 @@ def pileup( :param read group: read group of read to return. :param tag_name: read tag to check during read filtering. :param tag_value: tag value for reads to keep. - :param threshold: probability filter threshold for excluding calls from counts. + :param threshold: probability filter threshold for excluding + calls from counts. :param mod_base: ChEBI code of modified base to examine. :param max_depth: maximum read depth to examine. :param canon_base: canonical base corresponding to `mod_base`. - Required only if `mod_base is not a modification known to + Required only if `mod_base` is not a modification known to the code. :param combine: combine (include) all alternative modifications with the same parent canonical base. + :param min_mapq: minimum read mapping quality. """ for thresh in (low_threshold, high_threshold): if thresh < 0.0 or thresh > 1.0: @@ -181,7 +187,7 @@ def pileup( fsets, chrom.encode(), start, end, read_group, tag_name, tag_value, threshold, mod_base.struct, - combine, max_depth) + combine, max_depth, min_mapq) # TODO: check for NULL # copy data to numpy, we could be more clever here an wrap diff --git a/src/args.c b/src/args.c index b2804e5..f729421 100644 --- a/src/args.c +++ b/src/args.c @@ -79,6 +79,8 @@ static struct argp_option options[] = { "Only process reads with a given tag value.", 3}, {"haplotype", 0x300, "VAL", 0, "Only process reads from a given haplotype. Equivalent to --tag_name HP --tag_value VAL.", 3}, + {"map_q", 0x900, "QUAL", 0, + "Filter reads below this mapping quality.", 3}, { 0 } }; @@ -169,6 +171,9 @@ static error_t parse_opt (int key, char *arg, struct argp_state *state) { tag_items += 2; hp_given = true; break; + case 0x900: + arguments->min_mapQ = atoi(arg); + break; case 't': arguments->threads = atoi(arg); break; @@ -232,6 +237,7 @@ arguments_t parse_arguments(int argc, char** argv) { args.prefix = "mod-counts"; args.pileup = false; args.hts_maxcnt = INT_MAX; + args.min_mapQ = 0; argp_parse(&argp, argc, argv, 0, 0, &args); // allow CpG only for C! if (args.cpg || args.chh || args.chg) { diff --git a/src/args.h b/src/args.h index 6de36d8..86e35cb 100644 --- a/src/args.h +++ b/src/args.h @@ -25,6 +25,7 @@ typedef struct arguments { char* prefix; bool pileup; int hts_maxcnt; + int min_mapQ; } arguments_t; arguments_t parse_arguments(int argc, char** argv); diff --git a/src/bamiter.c b/src/bamiter.c index a758597..272a7ba 100644 --- a/src/bamiter.c +++ b/src/bamiter.c @@ -70,7 +70,8 @@ void destroy_filesets(set_fsets *s) { */ mplp_data *create_bam_iter_data( const bam_fset* bam_set, const char *chr, int start, int end, - const char *read_group, const char tag_name[2], const int tag_value) { + const char *read_group, const char tag_name[2], const int tag_value, + const int min_mapQ) { // open bam etc. // this is all now deferred to the caller @@ -96,7 +97,7 @@ mplp_data *create_bam_iter_data( data->fp = fp; data->idx = idx; data->hdr = hdr; data->iter = bam_itr_queryi(idx, mytid, start, end); memcpy(data->tag_name, tag_name, 2); data->tag_value = tag_value; - data->min_mapQ = 1; data->read_group = read_group; + data->min_mapQ = min_mapQ; data->read_group = read_group; return data; } diff --git a/src/bamiter.h b/src/bamiter.h index a9eebf3..42913e9 100644 --- a/src/bamiter.h +++ b/src/bamiter.h @@ -52,13 +52,15 @@ void destroy_filesets(set_fsets *s); * @param read_group by which to filter alignments. * @param tag_name by which to filter alignments. * @param tag_value associated with tag_name. + * @param min_mapQ minimum mapping quality of reads. * * The return value can be freed with destroy_bam_iter_data. * */ mplp_data *create_bam_iter_data( const bam_fset* fset, const char *chr, int start, int end, - const char *read_group, const char tag_name[2], const int tag_value); + const char *read_group, const char tag_name[2], const int tag_value, + const int min_mapQ); /** Clean up auxiliary bam reading data. * diff --git a/src/counts.c b/src/counts.c index c3c9932..fc6315d 100644 --- a/src/counts.c +++ b/src/counts.c @@ -458,6 +458,8 @@ bool query_mod_subtag(hts_base_mod_state *state, int qtype, int qcanonical, char * @param threshold probability filter for excluding calls from counts. * @param mb BAM code for modified base to report. (e.g. h for 5hmC), or a ChEBI code. * @param combine combine all modified bases corresponding to same canonical base as mb + * @param max_depth maximum depth of pileup. + * @param min_mapQ minimum mapping quality of reads. * @returns a pileup data pointer. * * The return value can be freed with destroy_plp_data. @@ -466,7 +468,7 @@ bool query_mod_subtag(hts_base_mod_state *state, int qtype, int qcanonical, char plp_data calculate_pileup( const set_fsets *fsets, const char *chr, int start, int end, const char *read_group, const char tag_name[2], const int tag_value, - int threshold, mod_base mb, bool combine, int max_depth) { + int threshold, mod_base mb, bool combine, int max_depth, int min_mapQ) { static bool shown_second_strand_warning = false; @@ -480,7 +482,7 @@ plp_data calculate_pileup( mplp_data **data = xalloc(fsets->n, sizeof(mplp_data*), "bam files"); for (size_t i = 0; i < nfile; ++i) { data[i] = create_bam_iter_data( - fsets->fsets[i], chr, start, end, read_group, tag_name, tag_value); + fsets->fsets[i], chr, start, end, read_group, tag_name, tag_value, min_mapQ); if (data[i] == NULL) { // TODO: clean-up all j