Skip to content

Commit

Permalink
-#8, #20 in progress
Browse files Browse the repository at this point in the history
  • Loading branch information
kwade4 committed Aug 15, 2019
1 parent c24bfb9 commit 4b58083
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 51 deletions.
50 changes: 28 additions & 22 deletions poplars/riplike.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,23 @@
from poplars.mafft import align


def hamming(fasta):
def hamming(bin_fasta):
"""
Convert list of lists into boolean outcomes (difference between query and reference)
:param fasta: object returned by align()
:param fasta: object returned by align() converted to bit strings
:return: dictionary of boolean lists keyed by reference label
"""
aln = dict(fasta)
assert "query" in aln, "Argument <fasta> must contain 'query' entry"
query = aln.pop('query')
_ = aln.pop('CON_OF_CONS')
query = bin_fasta.pop('query')

# iterate over remaining sequences as references
# Iterate over remaining sequences as references
results = {}
for h, s in aln.items():
for h, s in bin_fasta.items():
result = []
for i, nt1 in enumerate(query):
nt2 = s[i]
if nt1 == '-' or nt2 == '-':
for nt1, nt2 in zip(query, s):
if (nt1 | nt2) & 0B1111:
result.append(None)
continue
result.append(int(nt1 != nt2))
result.append(bin(nt1 ^ nt2))
results.update({h: result})

return results
Expand Down Expand Up @@ -61,17 +57,28 @@ def update_alignment(seq, reference):
return fasta2


def encode(sequence):
def encode(fasta):
"""
Encodes each nucleotide in a sequence using 4-bits
:param sequence: the sequence
:param fasta: the result of the alignment
:return: the sequence as a bitstring where each nucleotide is encoded using a 4-bits
"""
seq = []
binary_nt = {'A': 0B0001, 'T': 0B0010, 'C': 0B0011, 'G': 0B0100, ' ': 0B0000, '-': 0B1111}
for nt in sequence:
seq.append(binary_nt[nt])
return seq
bin_fasta = dict(fasta)
assert "query" in bin_fasta, "Argument <fasta> must contain 'query' entry"
_ = bin_fasta.pop('CON_OF_CONS')

binary_nt = {' ': 0B00000, 'A': 0B00001, 'T': 0B00010, 'C': 0B00011, 'G': 0B00100,
'N': 0B00101, 'R': 0B00110, 'Y': 0B00111, 'K': 0B01000, 'M': 0B01001,
'S': 0B01010, 'W': 0B01011, 'B': 0B01100, 'D': 0B01101, 'H': 0B01110,
'V': 0B01111, 'X': 0B10000, '-': 0B1111}

for h, s in bin_fasta.items():
seq = []
for nt in s:
seq.append(binary_nt[nt])
bin_fasta[h] = seq

return bin_fasta


def riplike(seq, reference, window=400, step=5, nrep=100):
Expand All @@ -89,7 +96,8 @@ def riplike(seq, reference, window=400, step=5, nrep=100):
fasta = update_alignment(seq, reference)
query = dict(fasta)['query'] # aligned query
seqlen = len(query)
ham = hamming(fasta)
bin_fasta = encode(fasta)
ham = hamming(bin_fasta)

for centre in range(window // 2, seqlen - (window // 2), step):
best_p, second_p = 1., 1. # maximum p-distance
Expand Down Expand Up @@ -193,5 +201,3 @@ def main():

if __name__ == '__main__':
main()


43 changes: 14 additions & 29 deletions poplars/sequence_locator.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(self, region_name, ncoords=None, nt_seq=None, pcoords=None, aa_seq=
self.nt_seq = nt_seq
self.pcoords = pcoords
self.aa_seq = aa_seq
self.rel_pos = {'CDS': [], 'gstart': [], 'qstart': [], 'rstart': [], 'pstart': []}
self.rel_pos = {'CDS': [], 'gstart': [], 'qstart': [], 'pstart': []}
self.codon_aln = ''

def get_coords(self, base):
Expand Down Expand Up @@ -106,18 +106,20 @@ def set_pos_from_pstart(self, virus):
else:
self.rel_pos['pstart'] = None

def set_pos_from_rstart(self, region):
start_offset = self.ncoords[0] - region.ncoords[0]
end_offset = self.ncoords[1] - region.ncoords[1]
self.rel_pos['rstart'] = [end_offset, start_offset]

def set_pos_from_qstart(self, query, base):
if self.ncoords is None or self.pcoords is None:
self.set_coords(self.get_coords(base), base)

start_offset = self.ncoords[0] - query.ncoords[0]
end_offset = self.ncoords[1] - query.ncoords[1]
self.rel_pos['qstart'] = [end_offset, start_offset]
"""
Gives the position of the sequence relative to the start of the region of interest
:param query: The GenomeRegion objects for the query
:param base: The base of the sequence (nucleotide or protein)
:return: The position relative to the start of the region of interest
"""
r_coords = self.get_coords(base)
r_seq = self.get_sequence(base)
if r_coords is not None and r_seq is not None:
q_coords = query.get_coords(base)
start_offset = r_coords[0] - q_coords[0] + 1
end_offset = start_offset + len(r_seq)
self.rel_pos['qstart'] = [start_offset, end_offset]

def make_codon_aln(self):
"""
Expand Down Expand Up @@ -495,7 +497,6 @@ def find_matches(virus, base, ref_regions, match_coordinates):
query_region.set_pos_from_cds(virus)
query_region.set_pos_from_gstart()
query_region.set_pos_from_qstart(ref_region, base)
query_region.set_pos_from_rstart(ref_region)
query_region.set_pos_from_pstart(virus)

if base == 'nucl':
Expand Down Expand Up @@ -587,10 +588,6 @@ def output_retrieved_region(region, outfile=None):
print("\t\tPosition relative to query start:\t{} --> {}"
.format(region.rel_pos['qstart'][0], region.rel_pos['qstart'][1]))

if region.rel_pos['rstart']:
print("\t\tPosition relative to region start:\t{} --> {}\n"
.format(region.rel_pos['rstart'][0], region.rel_pos['rstart'][1]))

else:
outfile.write("\nRetrieved Region:\t{}".format(region.region_name))
outfile.write("\tNucleotide Sequence:\n")
Expand Down Expand Up @@ -624,10 +621,6 @@ def output_retrieved_region(region, outfile=None):
outfile.write("\t\tPosition relative to query start:\t{} --> {}"
.format(region.rel_pos['qstart'][0], region.rel_pos['qstart'][1]))

if region.rel_pos['rstart']:
outfile.write("\t\tPosition relative to region start:\t{} --> {}\n"
.format(region.rel_pos['rstart'][0], region.rel_pos['rstart'][1]))


def output_overlap(overlap_regions, outfile=None):
"""
Expand Down Expand Up @@ -676,10 +669,6 @@ def output_overlap(overlap_regions, outfile=None):
print("\t\tPosition relative to query start:\t{} --> {}"
.format(region.rel_pos['qstart'][0], region.rel_pos['qstart'][1]))

if region.rel_pos['rstart']:
print("\t\tPosition relative to region start:\t{} --> {}\n"
.format(region.rel_pos['rstart'][0], region.rel_pos['rstart'][1]))

else:
for key in overlap_regions:
region = overlap_regions[key]
Expand Down Expand Up @@ -718,10 +707,6 @@ def output_overlap(overlap_regions, outfile=None):
outfile.write("\t\tPosition relative to query start:\t{} --> {}"
.format(region.rel_pos['qstart'][0], region.rel_pos['qstart'][1]))

if region.rel_pos['rstart']:
outfile.write("\t\tPosition relative to region start:\t{} --> {}\n"
.format(region.rel_pos['rstart'][0], region.rel_pos['rstart'][1]))


def retrieve(virus, base, ref_regions, region, qstart=1, qend='end'):
"""
Expand Down

0 comments on commit 4b58083

Please sign in to comment.