Skip to content

Commit

Permalink
Fix off by one error and lowercase cds
Browse files Browse the repository at this point in the history
  • Loading branch information
pvanheus committed Oct 20, 2022
1 parent 5f34b67 commit 7bb82d5
Showing 1 changed file with 25 additions and 12 deletions.
37 changes: 25 additions & 12 deletions tools/process_spaln_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def fix(attrs, region_name, number):

output_filename = "spaln_out.gff3"
output_file = open(output_filename, "w")
print('##gff-version\t3', file=output_file)
print("##gff-version\t3", file=output_file)
input_filenames = sys.argv[1:]
seen_header = False
gene_number = 0
Expand All @@ -42,7 +42,7 @@ def fix(attrs, region_name, number):
max_end = 0
for input_filename in input_filenames:
for line in open(input_filename):
if line.startswith('##sequence-region'):
if line.startswith("##sequence-region"):
parts = line.split()
orig_region_name = parts[1]
location_parts = parts[1].split(":")
Expand All @@ -51,10 +51,10 @@ def fix(attrs, region_name, number):
if region_name not in annotation_parts:
annotation_parts[region_name] = []
annotation_parts[region_name].append((start, region_name, input_filename))
elif not line.startswith('#'):
elif not line.startswith("#"):
fields = line.rstrip().split("\t")
seq_type = fields[2]
if seq_type == 'gene':
if seq_type == "gene":
# end is fields[4]
max_end = int(fields[4]) + start
else:
Expand All @@ -68,26 +68,39 @@ def fix(attrs, region_name, number):
sequence_region_added = False
for start, region_name, input_filename in sorted(locations, key=itemgetter(0)):
if not sequence_region_added:
assert start == start_position[region_name], "mismatch between this start position {} and computed start position {} for {}".format(start, start_position[region_name], region_name)
assert (
start == start_position[region_name]
), "mismatch between this start position {} and computed start position {} for {}".format(
start, start_position[region_name], region_name
)
# started a new contig, add a sequence region header that describes the region under annotation
print('##sequence-region\t{} {} {}'.format(region_name, start, end_position[region_name]), file=output_file)
print(
"##sequence-region\t{} {} {}".format(
region_name, start, end_position[region_name]
),
file=output_file,
)
sequence_region_added = True

for line in open(input_filename):
if line.startswith("##sequence-region") or line.startswith('##gff-version'):
if line.startswith("##sequence-region") or line.startswith("##gff-version"):
continue
elif line.startswith('##FASTA'):
elif line.startswith("##FASTA"):
break
elif line.startswith('##'):
print('Warning: unexpected directive {}'.format(line.strip()), file=output_file)
elif line.startswith("##"):
print(
"Warning: unexpected directive {}".format(line.strip()),
file=output_file,
)
continue
fields = line.rstrip().split("\t")
seq_type = fields[2]
if seq_type == "gene":
gene_number += 1
fields[0] = region_name
fields[3] = str(int(fields[3]) + start)
fields[4] = str(int(fields[4]) + start)
fields[2] = "CDS" if fields[2] == "cds" else fields[2]
fields[3] = str(int(fields[3]) + start - 1)
fields[4] = str(int(fields[4]) + start - 1)
fields[8] = fix(fields[8], region_name, gene_number)
new_line = "\t".join(fields)
print(new_line, end="", file=output_file)
Expand Down

0 comments on commit 7bb82d5

Please sign in to comment.