diff --git a/cgat/BamTools/bamtools.pyx b/cgat/BamTools/bamtools.pyx
index 0a1d4b33c..b717daf67 100644
--- a/cgat/BamTools/bamtools.pyx
+++ b/cgat/BamTools/bamtools.pyx
@@ -5,19 +5,18 @@ BamTools - Utilities for working with BAM files
 
-This module brings together convenience function for working
+This module brings together convenience functions for working
 with :term:`bam` formatted files.
 
 """
 
 from pysam.libchtslib cimport *
 from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
-from pysam.libcalignedsegment cimport pysam_bam_get_cigar, \
-    pysam_bam_get_qname, pysam_get_n_cigar
+from pysam.libcalignedsegment cimport pysam_bam_get_cigar, pysam_bam_get_qname, pysam_get_n_cigar
 from pysam.libcfaidx cimport *
 from libc.string cimport strchr
 from libc.stdint cimport int8_t
 from libc.stdio cimport puts, printf
 from libc.stdlib cimport abs
 from cpython cimport PyErr_SetString, PyBytes_FromStringAndSize
 from cpython cimport array as c_array
 from sortedcontainers import SortedList
 import array
diff --git a/cgat/BamTools/geneprofile.pyx b/cgat/BamTools/geneprofile.pyx
index d5f2a92c6..ac649c2b1 100644
--- a/cgat/BamTools/geneprofile.pyx
+++ b/cgat/BamTools/geneprofile.pyx
@@ -1,7 +1,7 @@
 #cimport csamtools
 
 from pysam.libchtslib cimport *
-from pysam.libcalignmentfile cimport *
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
 
 import pysam
 import collections, array, struct, sys, itertools
diff --git a/cgat/BamTools/peakshape.pyx b/cgat/BamTools/peakshape.pyx
index 64867e7e2..5df25c015 100644
--- a/cgat/BamTools/peakshape.pyx
+++ b/cgat/BamTools/peakshape.pyx
@@ -1,30 +1,37 @@
-#cimport csamtools
-
-from pysam.libcalignmentfile cimport *
+from pysam.libcalignmentfile cimport AlignmentFile, AlignedSegment
+from libc.stdlib cimport calloc, free
 
 import collections
 import cgatcore.experiment as E
 import numpy
 import cgat.Stats as Stats
 
+# Named tuples for counter results
 PeakShapeResult = collections.namedtuple(
     "PeakShapeResult",
-    "interval_width npeaks "
-    "peak_center peak_width peak_height peak_relative_pos "
-    "nreads "
-    "median closest_half_height furthest_halfheight "
-    "bins counts" )
-
+    "interval_width npeaks peak_center peak_width peak_height peak_relative_pos "
+    "nreads median closest_half_height furthest_halfheight bins counts"
+)
 
 PeakShapeCounts = collections.namedtuple(
     "PeakShapeCounts",
-    "nreads median counts" )
+    "nreads median counts"
+)
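+
+# Usage sketch (illustrative only, not part of this module): a caller
+# would typically instantiate a counter and query one interval, e.g.
+#
+#     counter = CounterBam(shift=0)
+#     result = counter.countInInterval(
+#         samfile, "chr1", 10000, 11000,
+#         bins=numpy.arange(-1000, 1000, 50), window_size=1000)
+#
+# and read the summary statistics off the returned PeakShapeResult.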
 
 
 cdef class Counter:
-    '''base class for counters computing densities
-    from genomic data.
+    '''Base class for counters computing densities from genomic data.
 
-    *smooth_method* is a function that will be applied
-    before sampling the bins.
+    *smooth_method* is a function applied to the raw counts
+    before they are summed into bins.
     '''
 
     cdef smooth_method
 
     def __init__(self, smooth_method=None):
@@ -36,346 +43,322 @@ cdef class Counter:
                        int pos,
                        bins,
                        **kwargs):
-        '''count and bin in bins.
-
-        Smoothes by summing counts in bins.
-
-        return a PeakShapeCounts tuple.
-        '''
+        '''Count around *pos* and aggregate the counts into *bins*.
+
+        Returns a PeakShapeCounts tuple.
+        '''
         cdef int xstart, xend, rstart, rend, i, offset
 
-        nreads, counts = self.coverageInInterval(infile,
-                                                 contig,
-                                                 max(0, pos + bins[0]),
-                                                 pos + bins[-1],
-                                                 **kwargs )
-
+        nreads, counts = self.coverageInInterval(
+            infile, contig, max(0, pos + bins[0]), pos + bins[-1], **kwargs)
         nbins = len(bins) - 1
         hist = numpy.zeros(nbins, dtype=numpy.int64)
-        cdef int lcounts = len(counts)
 
         if self.smooth_method is not None:
-            smoothed_counts = self.smooth_method(counts)
+            counts = self.smooth_method(counts)
+        cdef int lcounts = len(counts)
 
         offset = -bins[0]
         xstart = offset + bins[0]
         for i from 0 <= i < nbins:
             xend = offset + bins[i+1]
             # only take complete bins
             if xstart >= 0 and xend < lcounts:
                 hist[i] = sum(counts[xstart:xend])
             xstart = xend
 
-        result = PeakShapeCounts._make((nreads,
-                                        numpy.median(counts),
-                                        hist))
-
-        return result
+        return PeakShapeCounts._make((nreads, numpy.median(counts), hist))
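+
+    # Note on *bins*: values are bin edges relative to the reference
+    # point, so bin i collects counts over [bins[i], bins[i+1]); bins
+    # that fall partly outside the counted region are left at zero.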
 
     def countInInterval(self,
                         infile,
                         contig,
                         int start,
                         int end,
                         bins,
                         int window_size = 0,
                         float peak_ratio = 0.90,
                         use_interval=False,
                         smooth_method=None,
                         centring_method = "reads" ):
-        '''count density in window and compute peak-shape
-        summary parameters inside window.
+        '''Count density in a window and compute peak-shape
+        summary parameters inside the window.
 
         If *use_interval* is True, only counts within the
         actual interval as defined by *start* and *end* are
-        returned. Otherwise, if set to *True*, the peak location
-        will be located first and a window centered on the
-        peak of size *window_size* will be used.
+        used. Otherwise, the peak location is determined first
+        and a window of size *window_size* centred on the peak
+        is used.
 
         Smoothes by summing counts in bins.
 
-        return a result object.
+        Returns a PeakShapeResult tuple.
         '''
 
         cdef int peak_nreads = 0
         cdef int npeaks = 0
         cdef int peak_center = 0
         cdef int i, x
         cdef int xstart, xend, rstart, rend
 
         assert start >= 0, "start < 0"
 
-        # maximum extend of window = interval +- window_size
+        # maximum extent of window = interval +- window_size
         cdef int maxwindow_start = max(0, start - window_size)
         cdef int maxwindow_end = end + window_size
 
         # bases added at right/left of interval
         cdef int offset_right = maxwindow_end - end
         cdef int offset_left = start - maxwindow_start
 
         cdef int interval_width = end - start
 
         # get counts in window
         nreads, counts_in_window = self.coverageInInterval(infile,
                                                            contig,
                                                            maxwindow_start,
                                                            maxwindow_end)
 
         # counts only in interval - used to define peak center
-        counts_in_interval = counts_in_window[offset_left:-offset_right]
+        # guard against offset_right == 0, where a "-0" end index
+        # would produce an empty slice
+        if offset_right > 0:
+            counts_in_interval = counts_in_window[offset_left:-offset_right]
+        else:
+            counts_in_interval = counts_in_window[offset_left:]
 
         if len(counts_in_interval) == 0:
             E.warn("empty interval: %i - %i for %s:%i-%i" %
                    (offset_left, -offset_right, contig, start, end) )
             return None
 
         #################################################
         # compute peak shape parameters
         peak_nreads = max(counts_in_interval)
         peaks = numpy.array(range(0,interval_width) )[counts_in_interval >= peak_nreads]
         if centring_method == "reads":
             peak_center = peaks[len(peaks) // 2]
         elif centring_method == "middle":
             peak_center = interval_width // 2
         else:
             raise ValueError("unknown centring method '%s'" % centring_method)
 
         # define peak height
         cdef int peak_height = numpy.ceil(peak_ratio * peak_nreads)
 
         peaks = numpy.array(range(0,interval_width))[counts_in_interval >= peak_height]
         npeaks = len(peaks)
         peak_start = peaks[0]
         peak_end = peaks[-1]
         cdef int peak_width = peak_end - peak_start
 
-        # find size of peak
-        # cdef int peak_start, peak_end
-        # peak_start, peak_end = peak_center, peak_center
-
-        # for peak_center >= i >= 0:
-        #     if counts_in_interval[i] < peak_height:
-        #         peak_start = i
-        #         break
-
-        # for peak_center <= i < interval_width:
-        #     if counts_in_interval[i] < peak_height:
-        #         peak_end = i
-        #         break
-
         # closest and furthest distance of peak to half-height
         cdef int half_height = peak_height // 2
         cdef int left_first, left_last, right_first, right_last
         left_first, left_last, right_first, right_last = peak_center, 0, interval_width, peak_center
 
         for i from 0 <= i < peak_center:
             if counts_in_interval[i] >= half_height:
                 left_first = i
                 break
 
         for i from peak_center > i >= 0:
             if counts_in_interval[i] <= half_height:
                 left_last = i
                 break
 
         for i from peak_center < i < interval_width:
             if counts_in_interval[i] <= half_height:
                 right_first = i
                 break
 
         for i from interval_width > i >= peak_center:
             if counts_in_interval[i] >= half_height:
                 right_last = i
                 break
 
         cdef int furthest_dist = max(peak_center - left_first,
                                      right_last - peak_center)
         cdef int closest_dist = min(peak_center - left_last,
                                     right_first - peak_center)
 
         #################################################
         # compute histogram
         nbins = len(bins) - 1
         hist = numpy.zeros(nbins, dtype = numpy.int64)
 
         # decide in which region to count - interval or window
         if use_interval:
             counts = counts_in_interval
             # offset = peak
             offset = peak_center
         else:
             counts = counts_in_window
             # offset = peak to its corresponding location in
             # counts_in_window
             offset = peak_center + offset_left
 
         cdef int lcounts = len(counts)
 
         # count averages
         xstart = offset + bins[0]
         for i from 0 <= i < nbins:
             xend = offset + bins[i+1]
             # only take complete bins
             if xstart >= 0 and xend < lcounts:
                 hist[i] = sum(counts[xstart:xend])
             xstart = xend
 
         result = PeakShapeResult._make((interval_width, npeaks,
                                         start + peak_center,
                                         peak_width, peak_nreads,
                                         abs((interval_width // 2) - peak_center),
                                         nreads,
                                         numpy.median(counts),
                                         closest_dist, furthest_dist,
                                         bins,
                                         hist))
 
         return result
 
 
 cdef class CounterBam(Counter):
-    '''compute densities in intervals from bam files.'''
+    '''Compute densities in intervals from BAM files.'''
 
     cdef int shift
 
-    def __init__(self, shift = 0, *args, **kwargs):
-
+    def __init__(self, shift=0, *args, **kwargs):
         Counter.__init__(self, *args, **kwargs)
-
         self.shift = shift
 
-    #################################################
-    # bigwig versions
-    def coverageInInterval(self,
-                           AlignmentFile samfile,
-                           contig,
-                           int start,
-                           int end):
-        '''return coverage in window on *contig* bounded by *start* and *end*.
+    def coverageInInterval(self, AlignmentFile samfile, contig, int start, int end):
+        '''Return coverage in window on *contig* bounded by *start* and *end*.
 
         Reads are optionally shifted by shift/2. Reads are shifted upstream
         on the forward strand and downstream on the reverse strand.
 
-        returns a tuple:
-        nreads = number of reads/tags counted
-        counts = numpy array with reads per base
+        Returns a tuple (nreads, counts): the number of reads/tags
+        counted and a numpy array with reads per base.
         '''
         cdef int shift = self.shift
         cdef int offset = shift // 2
         cdef int nreads = 0
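+        # Counts are accumulated in a plain C integer array to keep the
+        # per-base inner loops cheap; the array is copied into a numpy
+        # array just before returning.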
         cdef int interval_width = end - start
-        cdef int * ccounts = calloc( interval_width, sizeof(int))
 
         if interval_width <= 0:
             return 0, numpy.zeros(0)
 
         if contig not in samfile.references:
             return 0, numpy.zeros(0)
 
+        # allocate only after the sanity checks so that an early
+        # return cannot leak the buffer
+        cdef int * ccounts = calloc(interval_width, sizeof(int))
+
         if shift:
-            xstart, xend = max(0, start - offset), max(0, end - offset)
-
+            # Peak counting follows the MACS protocol (see
+            # __tags_call_peak in MACS's PeakDetect.py): only the start
+            # of each read is used, taking the strand into account, and
+            # each tag is extended by 2 * offset. On the forward strand
+            # tags are shifted upstream, i.e. the downstream window is
+            # scanned.
+            xstart, xend = max(0, start - offset), max(0, end - offset)
             for read in samfile.fetch(contig, xstart, xend):
                 if read.is_reverse:
                     continue
                 nreads += 1
                 # extend reads from upstream pos
                 rstart = read.pos
                 rend = rstart + 2 * offset
                 # truncate to interval
-                rstart = max( 0, rstart - xstart )
-                rend = min( interval_width, rend - xstart )
+                rstart = max(0, rstart - xstart)
+                rend = min(interval_width, rend - xstart)
                 for i from rstart <= i < rend:
                     ccounts[i] += 1
 
             # on the - strand, shift tags downstream
             xstart, xend = start + offset, end + offset
-
-            for read in samfile.fetch( contig, xstart, xend ):
+            for read in samfile.fetch(contig, xstart, xend):
                 if not read.is_reverse:
                     continue
                 nreads += 1
                 rend = read.aend
-                # shift and extend reads from downstream pos
-                try:
-                    rstart = rend - 2 * offset
-                except TypeError:
-                    # read.aend can be None if CIGAR string is missing
-                    # read is unmapped but has still a coordinate
-                    # assigned.
+                if rend is None:
+                    # read.aend is None if the CIGAR string is missing -
+                    # the read is unmapped but still has a coordinate
+                    # assigned.
                     continue
+                # shift and extend reads from downstream pos
+                rstart = rend - 2 * offset
                 # truncate to interval
-                rstart = max( 0, rstart - xstart )
-                rend = min( interval_width, rend - xstart )
-
+                rstart = max(0, rstart - xstart)
+                rend = min(interval_width, rend - xstart)
                 for i from rstart <= i < rend:
                     ccounts[i] += 1
-
         else:
             for read in samfile.fetch(contig, start, end):
                 nreads += 1
+                if read.aend is None:
+                    # read.aend is None if the CIGAR string is missing -
+                    # the read is unmapped but still has a coordinate
+                    # assigned.
+                    continue
                 # truncate to interval
                 rstart = max(0, read.pos - start)
-                try:
-                    rend = min(interval_width, read.aend - start)
-                except TypeError:
-                    # aend can be None if CIGAR string is missing
-                    # read is unmapped but has still a coordinate
-                    # assigned.
-                    continue
-
+                rend = min(interval_width, read.aend - start)
                 for i from rstart <= i < rend:
                     ccounts[i] += 1
 
         # transfer to numpy count object
-        counts = numpy.zeros( interval_width, dtype = numpy.int64)
+        counts = numpy.zeros(interval_width, dtype=numpy.int64)
         for i from 0 <= i < interval_width:
             counts[i] = ccounts[i]
-
-        free( ccounts )
-
+        free(ccounts)
         return nreads, counts
 
 
 cdef class CounterBigwig(Counter):
-    '''compute densities in intervals from bigwig files.'''
+    '''Compute densities in intervals from BigWig files.'''
 
-    def __cinit__(self, shift = 0, *args, **kwargs):
+    def __cinit__(self, shift=0, *args, **kwargs):
         Counter.__init__(self, *args, **kwargs)
 
-    def coverageInInterval( self,
-                            wigfile,
-                            contig,
-                            int start,
-                            int end ):
-        '''return coverage in window on *contig* bounded by *start* and *end*.
-
-        Reads are optionally shifted by shift/2. Reads are shifted upstream
-        on the forward strand and downstream on the reverse strand.
-
-        returns a tuple:
-        nreads = number of reads/tags counted
-        counts = numpy array with reads per base
-        '''
-        # collect pileup profile in region bounded by maxwindow_start and maxwindow_end.
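+    # *wigfile* is assumed to behave like bx-python's BigWigFile:
+    # summarize(contig, start, end, nbins) returns an object whose
+    # valid_count and sum_data attributes hold per-bin aggregates.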
+    def coverageInInterval(self, wigfile, contig, int start, int end):
+        '''Return coverage in window on *contig* bounded by *start* and *end*.
+
+        Returns a tuple (nreads, counts).
+        '''
         cdef int interval_width = end - start
-
         if interval_width <= 0:
             return 0, numpy.zeros(0)
 
-        d = wigfile.summarize( contig, start, end, end - start)
-        # transfer to numpy count object
-        nreads = sum( d.valid_count )
-
+        d = wigfile.summarize(contig, start, end, interval_width)
+        nreads = sum(d.valid_count)
         return nreads, d.sum_data
diff --git a/cgat/FastqTools/fastqtools.pyx b/cgat/FastqTools/fastqtools.pyx
index a0e2db9cf..a4d6653f2 100644
--- a/cgat/FastqTools/fastqtools.pyx
+++ b/cgat/FastqTools/fastqtools.pyx
@@ -1,29 +1,26 @@
-"""Utility functions for the bam2stats utility."""
+"""Utility functions for working with FASTQ files."""
 
 import collections
-
 import pysam
-from pysam.libchtslib cimport *
 from pysam.libcbcf cimport VariantFile, VariantRecord, VariantRecordSample
 from pysam.libcfaidx cimport FastxFile, FastqProxy
 from pysam.libctabix cimport TabixFile
-from libc.string cimport strchr
-from libc.stdint cimport int8_t
-from libc.stdio cimport puts, printf
-from cpython cimport array as c_array
-
+from libc.stdint cimport uint32_t
 import cgatcore.experiment as E
 from cgat.Genomics import reverse_complement
 import numpy
 cimport numpy
 
+# Data type for numpy count arrays
 DTYPE = numpy.uint32
 ctypedef numpy.uint32_t DTYPE_t
 
 cdef count_diagonals(sequence,
                      forward_hash,
                      reverse_hash,
                      uint32_t query_sequence_length,
                      uint32_t kmer_size):
+    """Count diagonal k-mer matches of *sequence* against the forward
+    and reverse query hashes."""
     cdef uint32_t x, count
     cdef uint32_t offset = len(sequence)
     cdef uint32_t l = len(sequence) + query_sequence_length
@@ -44,28 +41,39 @@ cdef count_diagonals(sequence,
 
 def filter_by_sequence(
         query_sequence,
         FastxFile in_stream1,
         FastxFile in_stream2,
         outf_matched1,
         outf_matched2,
         outf_unmatched1,
         outf_unmatched2,
         uint32_t kmer_size=10,
         uint32_t min_kmer_matches=20):
-
+    """Filter paired-end reads by k-mer matches against *query_sequence*.
+
+    Read pairs where either mate shares more than *min_kmer_matches*
+    k-mers on a diagonal with the query are written to the matched
+    output streams, all other pairs to the unmatched ones.
+    """
+    # Prepare the k-mer hashes of the query and its reverse complement
     _query_sequence = query_sequence.encode("ascii")
     reverse_sequence = reverse_complement(query_sequence)
     cdef uint32_t query_sequence_length = len(query_sequence)
     cdef uint32_t x
-    cdef char * s = _query_sequence
-
     forward_hash = collections.defaultdict(list)
     reverse_hash = collections.defaultdict(list)
+
     for x from 0 <= x < query_sequence_length - kmer_size:
         forward_hash[query_sequence[x:x + kmer_size]].append(x)
         reverse_hash[reverse_sequence[x: x + kmer_size]].append(x)
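+
+    # How the matching works (a sketch): for every k-mer shared between
+    # a read and the query, count_diagonals tallies the hit on its
+    # alignment diagonal (read offset minus query offset); hits from a
+    # true overlap accumulate on a single diagonal, while spurious
+    # k-mer hits scatter across many diagonals.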
 
+    # Track counts for matched and unmatched read pairs
     cdef FastqProxy read1, read2
     cdef uint32_t ninput = 0
     cdef uint32_t nmatched = 0
@@ -73,20 +79,22 @@ def filter_by_sequence(
     cdef uint32_t fc1, rc1, fc2, rc2
 
     for read1, read2 in zip(in_stream1, in_stream2):
-
+        # Diagonal match counts for both mates, forward and reverse
         fc1, rc1 = count_diagonals(
             read1.sequence,
             forward_hash,
             reverse_hash,
             query_sequence_length,
             kmer_size)
 
         fc2, rc2 = count_diagonals(
             read2.sequence,
             forward_hash,
             reverse_hash,
             query_sequence_length,
             kmer_size)
 
+        # Write to matched or unmatched output based on the threshold
         if max(fc1, rc1, fc2, rc2) > min_kmer_matches:
             nmatched += 1
             outf_matched1.write(str(read1) + "\n")
@@ -98,9 +106,11 @@ def filter_by_sequence(
         ninput += 1
         if ninput % 1000 == 0:
-            E.info("iteration: {}, matched={}, unmatched={}, permille_matched={}".format(
-                ninput, nmatched, nunmatched, 1000.0 * nmatched / ninput))
+            E.info(
+                f"iteration: {ninput}, matched={nmatched}, "
+                f"unmatched={nunmatched}, "
+                f"permille_matched={1000.0 * nmatched / ninput}")
 
+    # Return a summary counter of input, matched and unmatched reads
     c = E.Counter()
     c.input = ninput
     c.matched = nmatched
diff --git a/cgat/VCFTools/vcftools.pyx b/cgat/VCFTools/vcftools.pyx
index 483ee02c8..975b39e79 100644
--- a/cgat/VCFTools/vcftools.pyx
+++ b/cgat/VCFTools/vcftools.pyx
@@ -1,18 +1,16 @@
-"""Utility functions for the bam2stats utility."""
+"""Utility functions for working with VCF files."""
 
 import pysam
 from pysam.libchtslib cimport *
 from pysam.libcbcf cimport VariantFile, VariantRecord, VariantRecordSample
 from pysam.libcfaidx cimport FastaFile
 from pysam.libctabix cimport TabixFile
-from libc.string cimport strchr
 from libc.stdint cimport int8_t
 from libc.stdio cimport puts, printf
 from cpython cimport array as c_array
 import array
 import base64
 import collections
-import collections
 import hashlib
 import itertools
 import math
@@ -23,7 +22,6 @@ import string
 import sys
 import cgatcore.experiment as E
-
 # deprecated: avoid R dependencies in cgat-apps
 # import rpy2.robjects
 # from rpy2.robjects import r as R