Skip to content

Commit

Permalink
Amend parser to make use of additional info and get URS coords right
Browse files Browse the repository at this point in the history
  • Loading branch information
afg1 committed Sep 8, 2023
1 parent 311a794 commit bbbb6f6
Showing 1 changed file with 14 additions and 11 deletions.
25 changes: 14 additions & 11 deletions rnacentral_pipeline/databases/rediportal/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re

import attr
import numpy as np
import pandas as pd
import pybedtools as pbt
from attr.validators import instance_of as is_a
Expand All @@ -23,8 +24,8 @@ def build(cls, raw):
chromosome = re.sub("^chr", "", raw.chrom)
return cls(
chromosome=chromosome,
start=raw.start_rel_genome + 1, ## BED is 0-based, so add 1
stop=raw.end_rel_genome + 1,
start=raw.start_rel_genome,
stop=raw.end_rel_genome,
)


Expand All @@ -48,8 +49,8 @@ def build(cls, raw_feature):
repeat_type=raw_feature.repeat_type,
ref=raw_feature.Ref,
ed=raw_feature.Ed,
start=raw_feature.start_rel_URS + 1, ## BED is 0-based, so add 1
stop=raw_feature.end_rel_URS + 1,
start=raw_feature.start_rel_URS,
stop=raw_feature.end_rel_URS,
genomic_location=GenomicLocation.build(raw_feature),
)

Expand Down Expand Up @@ -110,7 +111,7 @@ def parse(redi_bedfile, redi_metadata, rnc_bedfile, output):
"rnc_exon_start",
"rnc_exon_end",
"urs_taxid",
"_rnc_score",
"urs_length",
"_rnc_strand",
"rnc_transcript_start",
"rnc_transcript_end",
Expand All @@ -120,19 +121,21 @@ def parse(redi_bedfile, redi_metadata, rnc_bedfile, output):
.drop(
[
"_rnc_chrom",
"_rnc_score",
"_rnc_strand",
],
axis="columns",
)
)

intersection["start_rel_URS"] = (
intersection["start_rel_genome"] - intersection["rnc_transcript_start"]
)
intersection["end_rel_URS"] = (
intersection["end_rel_genome"] - intersection["rnc_transcript_start"]
intersection["start_rel_URS"] = np.where(
intersection["strand"] == "-",
intersection["rnc_exon_start"]
- intersection["start_rel_genome"]
+ intersection["urs_length"]
- 1, # -1 because of how 0-based BED coordinates interact with the length
intersection["start_rel_genome"] - intersection["rnc_transcript_start"],
)
intersection["end_rel_URS"] = intersection["start_rel_URS"]

complete_data = intersection.merge(metadata, how="inner", on="region_id")

Expand Down

0 comments on commit bbbb6f6

Please sign in to comment.