From 9a791b559db79ce506e7c55be6ecb0c1c9a15258 Mon Sep 17 00:00:00 2001 From: Kaitlyn Wade <48563808+kwade4@users.noreply.github.com> Date: Fri, 15 Nov 2019 16:47:00 -0500 Subject: [PATCH] -Refactoring to address incorrect coordinates (#20) -Modified to be more cohesive and object-oriented (in progress) --- poplars/sequence_locator.py | 638 +++++++++++++++++------------------- 1 file changed, 293 insertions(+), 345 deletions(-) diff --git a/poplars/sequence_locator.py b/poplars/sequence_locator.py index 75eb90b..13fd6b2 100644 --- a/poplars/sequence_locator.py +++ b/poplars/sequence_locator.py @@ -13,41 +13,35 @@ import textwrap -class GenomeRegion: +class Region: """ - Represents information about a genomic region + Stores information about a genome region """ - - def __init__(self, region_name, ncoords=None, nt_seq=None, pcoords=None, aa_seq=None): + def __init__(self, region_name, genome, ncoords=None, pcoords=None): """ - Stores information about each genomic region :param region_name: The name of the genomic region + :param genome: A reference to a genome object :param ncoords: A list containing the global start and end indices of the nucleotide region - :param nt_seq: The nucleotide sequence :param pcoords: A list containing the global start and end indices of the protein region - :param aa_seq: The amino acid sequence """ self.region_name = region_name + self.genome = genome self.ncoords = ncoords - self.nt_seq = nt_seq self.pcoords = pcoords - self.aa_seq = aa_seq - self.cds_offset, self.gstart, self.qstart, self.pstart = [], [], [], [] - self.codon_aln = '' + self.nt_seq = self.set_nt_seq_from_genome() + self.aa_seq = None def get_coords(self, base): if base == 'nucl': - coords = self.ncoords + return self.ncoords else: - coords = self.pcoords - return coords + return self.pcoords def get_sequence(self, base): if base == 'nucl': - sequence = self.nt_seq + return self.nt_seq else: - sequence = self.aa_seq - return sequence + return self.aa_seq def set_coords(self, coord_pair, base): if base == 'nucl': @@ -55,30 +49,142 @@ def set_coords(self, coord_pair, base): else: self.pcoords = coord_pair - def set_sequence(self, seq, base): + def set_nt_seq_from_genome(self): + return self.genome.nt_seq[self.ncoords[0] - 1: self.ncoords[1]] + + def set_sequence(self, base, seq): if base == 'nucl': self.nt_seq = seq else: self.aa_seq = seq - def set_seq_from_ref(self, sequence, base): + +class RefRegion(Region): + """ + Stores information about each region in the reference genome + """ + def __init__(self, region_name, genome, ncoords=None, pcoords=None): + super().__init__(region_name, genome, ncoords, pcoords) + self.codon_aln = '' + + def make_codon_aln(self): + """ + Aligns the protein sequence with its associated nucleotide sequence + """ + if self.nt_seq is not None and self.aa_seq is not None: + codon_aln = [] + codons = [''.join(t) for t in zip(*[iter(self.nt_seq)] * 3)] + for aa, codon in zip(self.aa_seq, codons): + # Check if stop codon + if codon == 'TAA' or codon == 'TGA' or codon == 'TAG': + codon_aln.append('-*-') + else: + codon_aln.append('-{}-'.format(aa)) + self.codon_aln = ''.join(codon_aln) + return self.codon_aln + + def find_overlap(self, base, q_coords): + """ + Gets the sequence regions that overlap with the region of interest + :param base: the base of the sequence (nucleotide or protein) + :param q_coords: the coordinates of the query region (global coordinates) + :return overlap: A QueryRegion object that represents the where the query overlaps with the reference region + """ + start, end = 0, 0 + ref_coords = self.get_coords(base) # Coordinates are global coordinates + + # If the ref_coords are in the range of the query region and the q_coords are in the range of the ref region + if not ((ref_coords[0] < ref_coords[1] < q_coords[0]) or (q_coords[0] < q_coords[1] < ref_coords[0])): + + # If the end of the query region exceeds the end of the reference region + if q_coords[1] > ref_coords[1]: + end = ref_coords[1] + else: + end = q_coords[1] + + # If the query region starts before the reference region starts + if q_coords[0] < ref_coords[0]: + start = ref_coords[0] + else: + start = q_coords[0] + + overlap = QueryRegion(self.region_name, self, base, self.genome) + + # Set the nucleotide and protein regions if base == 'nucl': - self.nt_seq = sequence[self.ncoords[0] - 1: self.ncoords[1]] + prot_overlap = self.set_protein_equivalents(overlap) + overlap.set_coords(q_coords, 'nucl') + overlap.set_sequence('nucl', self.get_sequence('nucl')[start: end]) + overlap.set_coords(prot_overlap[1], 'prot') + overlap.set_sequence(prot_overlap[0], 'prot') else: - self.aa_seq = sequence[self.pcoords[0] - 1: self.pcoords[1]] + nucl_overlap = self.set_nucleotide_equivalents(overlap) + overlap.set_coords(q_coords, 'prot') + overlap.set_sequence('prot', self.get_sequence('prot')[start: end]) + overlap.set_coords(nucl_overlap[1], 'nucl') + overlap.set_sequence(nucl_overlap[0], 'nucl') - def set_pos_from_cds(self, ref_reg_name): + return overlap # return overlap object not dict + + def set_protein_equivalents(self, overlap): """ - Gives the position of a the query sequence relative to the start of the coding sequence + Finds the protein equivalent of the nucleotide sequence + :param overlap: the region that aligns with the reference region + :return aa_seq, aa_coords: the sequence and coordinates of the overlapping protein sequence + """ + non_coding = ["5'LTR", "TAR", "3'LTR"] + aa_seq, aa_coords = '', [] + if self.region_name not in non_coding: + + # Slice aligned nucleotide sequence using coordinates of the query region + if self.codon_aln is not None: + aligned_prot = self.codon_aln[overlap.ncoords[0]: overlap.ncoords[1]] + aa_seq = re.sub('[-]', '', aligned_prot) # Get corresponding protein sequence + aa_coords = [1, len(aa_seq)] + + return aa_seq, aa_coords + + def set_nucleotide_equivalents(self, overlap): """ + Finds the nucleotide equivalent of the protein sequence + :param overlap: the region that aligns with the reference region + :return nt_seq, nt_coords: the sequence and coordinates of the overlapping nucleotide sequence + """ + nt_seq, nt_coords = '', [] + if self.ncoords is not None: + overlap.make_codon_aln() + regex = re.compile(overlap.codon_aln) + coords = regex.search(self.codon_aln).span() + nt_coords = overlap.get_coords('nucl') + nt_seq = self.nt_seq[coords[0]: coords[1]] + + return nt_seq, nt_coords + - ref_region = GENOME_REGIONS[ref_reg_name] - if self.ncoords and ref_region.ncoords: +class QueryRegion(Region): + """ + Represents information about the query sequence region + """ + def __init__(self, region_name, ref_region, base, genome, ncoords=None, pcoords=None): + super().__init__(region_name, genome, ncoords, pcoords) + + self.ref_region = ref_region + self.codon_aln = '' + self.set_pos_from_cds() + self.set_pos_from_gstart() + self.set_pos_from_qstart(base) + self.set_pos_from_rstart(base) + self.set_pos_from_pstart() + def set_pos_from_cds(self): + """ + Gives the position of a the query sequence relative to the start of the coding sequence + """ + if self.ncoords and self.ref_region.ncoords: query_start = self.ncoords[0] query_end = self.ncoords[1] - ref_start = ref_region.ncoords[0] - ref_end = ref_region.ncoords[1] + ref_start = self.ref_region.ncoords[0] + ref_end = self.ref_region.ncoords[1] if query_start == ref_start: start = 1 @@ -95,20 +201,32 @@ def set_pos_from_cds(self, ref_reg_name): def set_pos_from_gstart(self): self.gstart = self.ncoords - def set_pos_from_qstart(self, q_coords, base): + def set_pos_from_qstart(self, base): """ Gives the position of the sequence relative to the start of the region of interest - :param q_coords: The coodinates of the query region :param base: The base of the sequence (nucleotide or protein) - :return: The position relative to the start of the region of interest + :return: The position of the reference region relative to the query region """ - r_coords = self.get_coords(base) - r_seq = self.get_sequence(base) - if r_coords is not None and r_seq is not None: - start_offset = r_coords[0] - q_coords[0] + 1 - end_offset = start_offset + len(r_seq) - 1 + q_coords = self.get_coords(base) + q_seq = self.ref_region.get_sequence(base) + if q_coords is not None and q_seq is not None: + start_offset = q_coords[0] - self.ref_region.get_coords(base)[0] + 1 + end_offset = start_offset + len(q_seq) - 1 self.qstart = [start_offset, end_offset] + def set_pos_from_rstart(self, base): + """ + Gives the position of the sequence relative to the start of the region of interest + :param base: The base of the sequence (nucleotide or protein) + :return: The position of the reference region relative to the query region + """ + q_coords = self.get_coords(base) + q_seq = self.ref_region.get_sequence(base) + if q_coords is not None and q_seq is not None: + start_offset = self.ref_region.get_coords(base)[0] + 1 - q_coords[0] + end_offset = len(q_seq) - 1 - start_offset + self.rstart = [start_offset, end_offset] + def set_pos_from_pstart(self): """ Gives the position of the sequence relative to the start of the protein sequence @@ -119,44 +237,6 @@ def set_pos_from_pstart(self): else: self.pstart = self.pcoords - def make_codon_aln(self): - """ - Aligns the protein sequence with its associated nucleotide sequence - """ - if self.nt_seq is not None and self.aa_seq is not None: - codon_aln = [] - codons = [''.join(t) for t in zip(*[iter(self.nt_seq)] * 3)] - for aa, codon in zip(self.aa_seq, codons): - # Check if stop codon - if codon == 'TAA' or codon == 'TGA' or codon == 'TAG': - codon_aln.append('-*-') - else: - codon_aln.append('-{}-'.format(aa)) - self.codon_aln = ''.join(codon_aln) - return self.codon_aln - - @staticmethod - def global_to_local_index(coord_pair): - """ - Converts a pair of global indices to local indices, relative to the sequence region of interest - """ - start = coord_pair[0] # 0-based inclusive indexing - start_offset = coord_pair[0] - start - end_offset = coord_pair[1] - start - local_pair = [start_offset, end_offset] - return local_pair - - @staticmethod - def local_to_global_index(r_coords, local_pair): - """ - Converts a pair of local indices (relative to the region of interest) to global indices - """ - if r_coords is not None: - global_start = r_coords[0] + local_pair[0] - 1 - global_end = r_coords[0] + local_pair[1] - 1 - global_pair = [global_start, global_end] - return global_pair - def set_pcoords_from_ncoords(self): """ Sets protein coordinates relative to the protein start, given the nucleotide coordinates @@ -177,53 +257,139 @@ def set_ncoords_from_pcoords(self): return [nucl_start, nucl_end] -def make_regions(nt_coords, nt_seq, aa_coords, aa_seq): +class Genome: """ - Reads in the start and end coordinates of the genomic regions and associates the region with its coordinates. - If no coordinate files are specified, set default nucleotide and protein coordinate files. - :param nt_coords: Path to the csv file containing the global coordinates of the nucleotide region. - The file stream has one genomic entry per line and has the following format: region_name,start,end - :param nt_seq: The nucleotide sequence - :param aa_coords: Path to the csv file containing the global coordinates of the protein region. - The file stream has one genomic entry per line and has the following format: region_name,start,end - :param aa_seq: A list of lists containing the protein sequences - :return genome_regions: A dictionary with keys as the region names and the values are GenomeRegion objects + Stores information about the reference genome """ + def __init__(self, nt_coords, nt_seq, aa_coords, aa_seq, reference_sequence): + self.nt_seq = nt_seq + # self.aa_seq = aa_seq # List of lists + self.ref_genome_regions = self.make_ref_regions(nt_coords, aa_coords, aa_seq) + self.reference_sequence = reference_sequence + + def make_ref_regions(self, nt_coords, aa_coords, aa_seq): + """ + Reads in the start and end coordinates of the genomic regions and associates the region with its coordinates. + If no coordinate files are specified, set default nucleotide and protein coordinate files. + :param nt_coords: File stream containing coordinates for nucleotide regions. Format: region_name,start,end + :param aa_coords: File stream containing coordinates for protein regions. Format: region_name,start,end + :param aa_seq: A list of lists containing the protein sequences + """ + ref_regions = {} + # Parse nucleotide region coordinates file + for nt_line in nt_coords: + nt_line = nt_line.strip() + nt_line = nt_line.split(',') + nucl_coords = [int(nt_line[1]), int(nt_line[2])] + seq_region = RefRegion(nt_line[0], self, nucl_coords) + ref_regions[nt_line[0]] = seq_region + + # Parse protein coordinates file + prot_names, prot_coords = [], [] + for aa_line in aa_coords: + aa_line = aa_line.strip() + aa_line = aa_line.split(',') + prot_names.append(aa_line[0]) + prot_coords.append([int(aa_line[1]), int(aa_line[2])]) + + # Match protein regions to nucleotide regions + for i, coords in enumerate(prot_coords): + for region_name in ref_regions: + if prot_names[i].startswith(ref_regions[region_name].region_name): + # Set protein coordinates and sequences + ref_regions[region_name].set_coords(coords, 'prot') + ref_regions[region_name].set_sequence('prot', aa_seq[i][1]) + + # Align nucleotide sequence with protein sequence + for name in ref_regions: + ref_regions[name].make_codon_aln() + + return ref_regions + + def sequence_align(self, query_sequence, outfile=None): + """ + Aligns the query sequence to the reference genome + :param query_sequence: The query sequence + :param outfile: