Skip to content

Commit

Permalink
Merge pull request #178 from RNAcentral/master
Browse files Browse the repository at this point in the history
Update dev branch
  • Loading branch information
carlosribas authored Jan 9, 2024
2 parents ba27977 + 0bbd082 commit 12076d3
Show file tree
Hide file tree
Showing 35 changed files with 503 additions and 625 deletions.
779 changes: 266 additions & 513 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions files/import-data/expressionatlas/lookup-dump-query.sql
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,7 @@ COPY(
JOIN rna
ON xref.upi = rna.upi

WHERE xref.deleted = 'N'


) TO STDOUT CSV HEADER
2 changes: 2 additions & 0 deletions files/r2dt/find-sequences.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,10 @@ SELECT
'sequence', COALESCE(rna.seq_short, rna.seq_long)
)
FROM rna
JOIN xref on xref.upi = rna.upi
WHERE
not exists(select 1 from pipeline_tracking_traveler track where track.urs = rna.upi)
AND rna.len < :max_len
AND xref.deleted = 'N'
LIMIT :sequence_count
) TO STDOUT;
3 changes: 1 addition & 2 deletions files/r2dt/should-show/update.ctl
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,7 @@ TARGET COLUMNS (

WITH
FIELDS ESCAPED BY double-quote,
FIELDS TERMINATED BY ',',
SKIP header = 1
FIELDS TERMINATED BY ','

BEFORE LOAD DO
$$
Expand Down
2 changes: 2 additions & 0 deletions files/search-export/parts/litsumm.sql
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,7 @@ COPY (
JOIN litsumm_summaries lss
ON
todo.urs_taxid = lss.primary_id
WHERE
lss.should_show = true
ORDER by todo.id
) TO STDOUT
5 changes: 4 additions & 1 deletion rnacentral_pipeline/cli/expressionatlas.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,4 +42,7 @@ def process_csv(csv_file, output, db_url):
"""
entries = parser.parse(csv_file, db_url)
with entry_writer(Path(output)) as writer:
writer.write(entries)
try:
writer.write(entries)
except ValueError:
print("No entries from this chunk")
1 change: 1 addition & 0 deletions rnacentral_pipeline/databases/data/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class Database(enum.Enum):
lncrnadb = DatabaseValue(19, "lncRNAdb")
malacards = DatabaseValue(20, "MalaCards")
mgi = DatabaseValue(21, "MGI")
mgnify = DatabaseValue(55, "MGNIFY")
mirbase = DatabaseValue(22, "miRBase")
mirgenedb = DatabaseValue(23, "MirGeneDB")
modomics = DatabaseValue(24, "Modomics")
Expand Down
5 changes: 2 additions & 3 deletions rnacentral_pipeline/databases/data/regions.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,8 @@
limitations under the License.
"""

import operator as op

import enum
import operator as op

import attr
from attr.validators import instance_of as is_a
Expand Down Expand Up @@ -308,7 +307,7 @@ def writeable(self, accession, is_upi=False, require_strand=True):
name,
self.chromosome,
self.strand.display_int(),
self.assembly_id,
self.assembly_id.strip(),
len(self.exons),
normalized.start,
normalized.stop,
Expand Down
9 changes: 8 additions & 1 deletion rnacentral_pipeline/databases/expressionatlas/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,13 @@ def references(interactions):
return list(refs)


def rna_type(type_str):
    """
    Pick the RNA type to store on an entry. The given value is passed through
    untouched unless it is None, in which case a fixed SO term is substituted.
    """
    # NOTE(review): SO:0000655 is presumably the generic ncRNA term - confirm.
    return "SO:0000655" if type_str is None else type_str


def as_entry(info, experiment):
synonyms = list(
filter(None, [""] if info["Gene Name"] == [None] else info["Gene Name"])
Expand All @@ -70,7 +77,7 @@ def as_entry(info, experiment):
database="Expression Atlas",
sequence=info["seq"][0],
regions=region_builder(info),
rna_type=info["rna_type"][0],
rna_type=rna_type(info["rna_type"][0]),
url=url(experiment),
seq_version="1",
description=info["description"][0],
Expand Down
1 change: 0 additions & 1 deletion rnacentral_pipeline/databases/expressionatlas/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,5 +37,4 @@ def parse(handle, db_url):
for line in handle:
hit = json.loads(line)
for experiment in hit["experiment"]:
print(hit)
yield helpers.as_entry(hit, experiment)
20 changes: 17 additions & 3 deletions rnacentral_pipeline/databases/generic/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,10 @@ def taxid(entry):

base, tid = entry["taxonId"].split(":", 1)
assert base == "NCBITaxon"
return int(tid)
try:
return int(tid)
except ValueError:
raise phy.FailedTaxonId(tid)


def as_exon(exon, context):
Expand Down Expand Up @@ -368,7 +371,7 @@ def as_entry(record, context):
accession=record["primaryId"],
ncbi_tax_id=taxid(record),
database=context.database,
sequence=record["sequence"],
sequence=record["sequence"].replace("U", "T"),
regions=regions(record, context),
rna_type=record["soTermId"],
url=record["url"],
Expand Down Expand Up @@ -415,7 +418,18 @@ def key(raw):
metadata_refs = [pub.reference(r) for r in metadata_pubs]

for gene_id, records in it.groupby(ncrnas, gene):
entries = [as_entry(r, context) for r in records]
entries = []
for r in records:
try:
entries.append(as_entry(r, context))
except phy.UnknownTaxonId as e:
print("Unknown taxid for %s" % r["primaryId"])
print(f"UnknownTaxonId: {e}")
continue
except phy.FailedTaxonId as e:
print("Taxid failed for %s" % r["primaryId"])
print(f"FailingTaxonId: {e}")
continue

if gene_id:
entries = add_related_by_gene(entries)
Expand Down
14 changes: 10 additions & 4 deletions rnacentral_pipeline/databases/ols/fetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from rnacentral_pipeline.databases.data import OntologyTerm

BASE = "https://www.ebi.ac.uk/ols/api/ontologies"
OLS4_BASE = "https://www.ebi.ac.uk/ols4/api/ontologies"


@retry(requests.HTTPError, tries=5, delay=1)
Expand All @@ -42,8 +43,14 @@ def ontology_url(ontology):
"""
This will fetch the base URL to use with the given ontology name.
"""

url = furl(BASE)
manual_lookup = {
"ECO": "http://purl.obolibrary.org/obo/ECO_",
"GO": "http://purl.obolibrary.org/obo/GO_",
"SO": "http://purl.obolibrary.org/obo/SO_",
}
if ontology.upper() in manual_lookup.keys():
return furl(manual_lookup[ontology.upper()])
url = furl(OLS4_BASE)
url.path.segments.append(ontology.upper())
info = asyncio.run(query_ols(url.url))
return furl(info["config"]["baseUris"][0])
Expand All @@ -55,7 +62,7 @@ def term_url(term_id):
ont_url.path.segments[-1] += rest
iri = six.moves.urllib.parse.quote_plus(ont_url.url)

url = furl(BASE)
url = furl(OLS4_BASE)
url.path.segments.extend([ontology, "terms", iri])
return url

Expand All @@ -72,7 +79,6 @@ def term(term_id):
url = term_url(term_id)
term_info = asyncio.run(query_ols(url.url))

print(term_info)
definition = (
term_info["annotation"].get("definition", [None])[0]
or term_info.get("description", [None])[0]
Expand Down
3 changes: 3 additions & 0 deletions rnacentral_pipeline/rnacentral/ftp_export/id_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ def gene(result):
if result["database"] == "ENSEMBL":
return result["optional_id"]

if result["database"] == "MIRBASE":
return result["optional_id"]

if result["rna_type"] == "piRNA" and result["database"] == "ENA":
return result["product"]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
Database.zwd,
Database.noncode,
Database.evlncrnas,
Database.mgnify,
]
"""
A dict that defines the ordered choices for each type of RNA. This is the
Expand Down
14 changes: 14 additions & 0 deletions rnacentral_pipeline/rnacentral/precompute/rna_type/so_term.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@

LOGGER = logging.getLogger(__name__)

# Databases whose accessions are skipped when all_annotations() collects the
# RNA type annotations for a sequence.
EXCLUDED_DATABASES = {
Database.genecards,
Database.malacards,
}

ACCEPTED_DATABASES = {
Database.five_srrnadb,
Database.flybase,
Expand Down Expand Up @@ -274,8 +279,15 @@ def fn(source) -> bool:
def all_annotations(
context: context.Context, sequence: seq.Sequence
) -> ty.List[RnaTypeAnnotation]:
"""
This builds a list of RNATypeAnnotations using the accessions, R2DT and
Rfam hits. However, this will exclude using annotations from all databases
in EXCLUDED_DATABASES.
"""
annotations = []
for accession in sequence.accessions:
if accession.database in EXCLUDED_DATABASES:
continue
annotations.append(RnaTypeAnnotation.from_accession(context, accession))
for r2dt in sequence.r2dt_hits:
if r2dt.paired_ratio() is None or r2dt.paired_ratio() > 0.80:
Expand All @@ -296,6 +308,8 @@ def rna_type_of(
) -> ty.Optional[RnaType]:

annotations = all_annotations(context, sequence)
if len(annotations) == 0:
return RnaType.ncRNA()
merged = merge_annotations(annotations)

if len(merged) == 1:
Expand Down
2 changes: 1 addition & 1 deletion rnacentral_pipeline/rnacentral/r2dt/should_show.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ def write(model_path: Path, handle: ty.IO, db_url: str, output: ty.IO):
to_write = pd.DataFrame()
to_write["urs"] = frame["urs"]
to_write["should_show"] = predicted.astype(int)
to_write.to_csv(output, index=False)
to_write.to_csv(output, index=False, header=False)


def write_model(handle: ty.IO, db_url: str, output: Path):
Expand Down
26 changes: 23 additions & 3 deletions rnacentral_pipeline/rnacentral/search_export/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -715,18 +715,26 @@ def has_editing_event(editing_events):

def edit_chromosome(editing_events):
    """
    Get the chromosome for a collection of editing events.

    All events are expected to be on the same chromosome, so only the first
    event is inspected. When there are no events (an empty collection or None)
    an empty list is returned instead.
    """
    # Truthiness check rather than len() so a missing (None) value does not
    # raise a TypeError.
    if not editing_events:
        return []
    return editing_events[0]["chromosome"]


def edit_locations(editing_events):
    """
    Collect the "genomic_location" value of every editing event, in order.

    An empty list is returned when there are no events (an empty collection
    or None).
    """
    # Truthiness check rather than len() so a missing (None) value does not
    # raise a TypeError.
    if not editing_events:
        return []
    return [edit["genomic_location"] for edit in editing_events]


def edit_repeat_type(editing_events):
    """
    Get the "repeat_type" value of the first editing event.

    Only the first event is inspected. When there are no events (an empty
    collection or None) an empty list is returned instead.
    """
    # Truthiness check rather than len() so a missing (None) value does not
    # raise a TypeError.
    if not editing_events:
        return []
    return editing_events[0]["repeat_type"]


def edit_ref_to_edit(editing_events):
    """
    Build a "<reference>-><edit>" string for every editing event, in order.

    An empty list is returned when there are no events (an empty collection
    or None).
    """
    # Truthiness check rather than len() so a missing (None) value does not
    # raise a TypeError.
    if not editing_events:
        return []
    return [f"{edit['reference']}->{edit['edit']}" for edit in editing_events]


Expand Down Expand Up @@ -858,9 +866,21 @@ def edit_ref_to_edit(editing_events):
field("has_lit_scan", has_publications, keys="publication_count"),
field("has_litsumm", has_litsumm, keys="litsumm"),
field("has_editing_event", has_editing_event, keys="editing_events"),
field("edit_chromosome", edit_chromosome, keys="editing_events"),
field("edit_locations", edit_locations, keys="editing_events"),
field("edit_repeat_type", edit_repeat_type, keys="editing_events"),
field(
"edit_chromosome",
edit_chromosome,
keys="editing_events",
),
field(
"edit_locations",
edit_locations,
keys="editing_events",
),
field(
"edit_repeat_type",
edit_repeat_type,
keys="editing_events",
),
## Add new fields above this line! Otherwise editing the produced xml is hard.
tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"),
],
Expand Down
19 changes: 19 additions & 0 deletions tests/rnacentral/ftp_export/id_mapping_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,15 @@ def test_can_create_accession(data, expected):
},
"bob",
),
(
{
"gene": "gene1",
"database": "MIRBASE",
"optional_id": "hsa-mir-1",
"rna_type": "miRNA",
},
"hsa-mir-1",
),
],
)
def test_can_generate_gene(data, expected):
Expand Down Expand Up @@ -223,6 +232,16 @@ def test_as_entry_works_correctly():
],
],
),
# (
# "URS000069C337_9606",
# [
# ["URS000069C337_9606", "ENSEMBL", "ENST00000401212", 9606, "pre-miRNA", "MIR298"],
# ["URS000069C337_9606", "GENECARDS", "", 9606, "pre-miRNA", "MIR298"],
# ["URS000069C337_9606", "MALACARDS", "", 9606, "pre-miRNA", "MIR298"],
# ["URS000069C337_9606", "MIRBASE", "", 9606, "pre-miRNA", "MIR298"],
# ["URS000069C337_9606", "REFSEQ", "", 9606, "pre-miRNA", "MIR298"],
# ]
# ),
],
)
def test_can_create_expected_exports(rna_id, expected):
Expand Down
2 changes: 1 addition & 1 deletion utils/expression-atlas/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ log = "0.4"
env_logger = "0.9.0"
multimap = "0.8.3"
clap = { version = "3.1.18", features = ["derive"] }
polars = { version = "0.21.1", features = ["lazy", "csv-file", "rows", "abs", "is_in", "strings", "concat_str", "list", "json"] }
quick-xml = { version = "0.22.0", features = ["serialize"] }
serde = { version = "1.0", features = [ "derive" ] }
polars = { version = "0.33.2", features = ["lazy", "rows", "abs", "is_in", "strings", "concat_str", "json", "streaming"] }
Loading

0 comments on commit 12076d3

Please sign in to comment.