Merge pull request #178 from RNAcentral/master

Update dev branch
RNAcentral · Jan 9, 2024 · 12076d3 · 12076d3
2 parents ba27977 + 0bbd082
commit 12076d3
Show file tree

Hide file tree

Showing 35 changed files with 503 additions and 625 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/files/import-data/expressionatlas/lookup-dump-query.sql b/files/import-data/expressionatlas/lookup-dump-query.sql
@@ -13,5 +13,7 @@ COPY(
   JOIN rna
   ON xref.upi = rna.upi
 
+  WHERE xref.deleted = 'N'
+
 
   ) TO STDOUT CSV HEADER
diff --git a/files/r2dt/find-sequences.sql b/files/r2dt/find-sequences.sql
@@ -5,8 +5,10 @@ SELECT
     'sequence', COALESCE(rna.seq_short, rna.seq_long)
   )
 FROM rna
+JOIN xref on xref.upi = rna.upi
 WHERE
   not exists(select 1 from pipeline_tracking_traveler track where track.urs = rna.upi)
   AND rna.len < :max_len
+  AND xref.deleted = 'N'
   LIMIT :sequence_count
 ) TO STDOUT;
diff --git a/files/r2dt/should-show/update.ctl b/files/r2dt/should-show/update.ctl
@@ -11,8 +11,7 @@ TARGET COLUMNS (
 
 WITH
   FIELDS ESCAPED BY double-quote,
-  FIELDS TERMINATED BY ',',
-  SKIP header = 1
+  FIELDS TERMINATED BY ','
 
 BEFORE LOAD DO
 $$

diff --git a/files/search-export/parts/litsumm.sql b/files/search-export/parts/litsumm.sql
@@ -9,5 +9,7 @@ COPY (
     JOIN litsumm_summaries lss
     ON
       todo.urs_taxid = lss.primary_id
+    WHERE
+      lss.should_show = true
     ORDER by todo.id
 ) TO STDOUT
diff --git a/rnacentral_pipeline/cli/expressionatlas.py b/rnacentral_pipeline/cli/expressionatlas.py
@@ -42,4 +42,7 @@ def process_csv(csv_file, output, db_url):
     """
     entries = parser.parse(csv_file, db_url)
     with entry_writer(Path(output)) as writer:
-        writer.write(entries)
+        try:
+            writer.write(entries)
+        except ValueError:
+            print("No entries from this chunk")
diff --git a/rnacentral_pipeline/databases/data/databases.py b/rnacentral_pipeline/databases/data/databases.py
@@ -55,6 +55,7 @@ class Database(enum.Enum):
     lncrnadb = DatabaseValue(19, "lncRNAdb")
     malacards = DatabaseValue(20, "MalaCards")
     mgi = DatabaseValue(21, "MGI")
+    mgnify = DatabaseValue(55, "MGNIFY")
     mirbase = DatabaseValue(22, "miRBase")
     mirgenedb = DatabaseValue(23, "MirGeneDB")
     modomics = DatabaseValue(24, "Modomics")

diff --git a/rnacentral_pipeline/databases/data/regions.py b/rnacentral_pipeline/databases/data/regions.py
@@ -13,9 +13,8 @@
 limitations under the License.
 """
 
-import operator as op
-
 import enum
+import operator as op
 
 import attr
 from attr.validators import instance_of as is_a
@@ -308,7 +307,7 @@ def writeable(self, accession, is_upi=False, require_strand=True):
                 name,
                 self.chromosome,
                 self.strand.display_int(),
-                self.assembly_id,
+                self.assembly_id.strip(),
                 len(self.exons),
                 normalized.start,
                 normalized.stop,

diff --git a/rnacentral_pipeline/databases/expressionatlas/helpers.py b/rnacentral_pipeline/databases/expressionatlas/helpers.py
@@ -59,6 +59,13 @@ def references(interactions):
     return list(refs)
 
 
+def rna_type(type_str):
+    if type_str is None:
+        return "SO:0000655"
+    else:
+        return type_str
+
+
 def as_entry(info, experiment):
     synonyms = list(
         filter(None, [""] if info["Gene Name"] == [None] else info["Gene Name"])
@@ -70,7 +77,7 @@ def as_entry(info, experiment):
         database="Expression Atlas",
         sequence=info["seq"][0],
         regions=region_builder(info),
-        rna_type=info["rna_type"][0],
+        rna_type=rna_type(info["rna_type"][0]),
         url=url(experiment),
         seq_version="1",
         description=info["description"][0],

diff --git a/rnacentral_pipeline/databases/expressionatlas/parser.py b/rnacentral_pipeline/databases/expressionatlas/parser.py
@@ -37,5 +37,4 @@ def parse(handle, db_url):
     for line in handle:
         hit = json.loads(line)
         for experiment in hit["experiment"]:
-            print(hit)
             yield helpers.as_entry(hit, experiment)
diff --git a/rnacentral_pipeline/databases/generic/v1.py b/rnacentral_pipeline/databases/generic/v1.py
@@ -71,7 +71,10 @@ def taxid(entry):
 
     base, tid = entry["taxonId"].split(":", 1)
     assert base == "NCBITaxon"
-    return int(tid)
+    try:
+        return int(tid)
+    except ValueError:
+        raise phy.FailedTaxonId(tid)
 
 
 def as_exon(exon, context):
@@ -368,7 +371,7 @@ def as_entry(record, context):
         accession=record["primaryId"],
         ncbi_tax_id=taxid(record),
         database=context.database,
-        sequence=record["sequence"],
+        sequence=record["sequence"].replace("U", "T"),
         regions=regions(record, context),
         rna_type=record["soTermId"],
         url=record["url"],
@@ -415,7 +418,18 @@ def key(raw):
     metadata_refs = [pub.reference(r) for r in metadata_pubs]
 
     for gene_id, records in it.groupby(ncrnas, gene):
-        entries = [as_entry(r, context) for r in records]
+        entries = []
+        for r in records:
+            try:
+                entries.append(as_entry(r, context))
+            except phy.UnknownTaxonId as e:
+                print("Unknown taxid for %s" % r["primaryId"])
+                print(f"UnknownTaxonId: {e}")
+                continue
+            except phy.FailedTaxonId as e:
+                print("Taxid failed for %s" % r["primaryId"])
+                print(f"FailingTaxonId: {e}")
+                continue
 
         if gene_id:
             entries = add_related_by_gene(entries)

diff --git a/rnacentral_pipeline/databases/ols/fetch.py b/rnacentral_pipeline/databases/ols/fetch.py
@@ -26,6 +26,7 @@
 from rnacentral_pipeline.databases.data import OntologyTerm
 
 BASE = "https://www.ebi.ac.uk/ols/api/ontologies"
+OLS4_BASE = "https://www.ebi.ac.uk/ols4/api/ontologies"
 
 
 @retry(requests.HTTPError, tries=5, delay=1)
@@ -42,8 +43,14 @@ def ontology_url(ontology):
     """
     This will fetch the base URL to use with the given ontology name.
     """
-
-    url = furl(BASE)
+    manual_lookup = {
+        "ECO": "http://purl.obolibrary.org/obo/ECO_",
+        "GO": "http://purl.obolibrary.org/obo/GO_",
+        "SO": "http://purl.obolibrary.org/obo/SO_",
+    }
+    if ontology.upper() in manual_lookup.keys():
+        return furl(manual_lookup[ontology.upper()])
+    url = furl(OLS4_BASE)
     url.path.segments.append(ontology.upper())
     info = asyncio.run(query_ols(url.url))
     return furl(info["config"]["baseUris"][0])
@@ -55,7 +62,7 @@ def term_url(term_id):
     ont_url.path.segments[-1] += rest
     iri = six.moves.urllib.parse.quote_plus(ont_url.url)
 
-    url = furl(BASE)
+    url = furl(OLS4_BASE)
     url.path.segments.extend([ontology, "terms", iri])
     return url
 
@@ -72,7 +79,6 @@ def term(term_id):
     url = term_url(term_id)
     term_info = asyncio.run(query_ols(url.url))
 
-    print(term_info)
     definition = (
         term_info["annotation"].get("definition", [None])[0]
         or term_info.get("description", [None])[0]

diff --git a/rnacentral_pipeline/rnacentral/ftp_export/id_mapping.py b/rnacentral_pipeline/rnacentral/ftp_export/id_mapping.py
@@ -26,6 +26,9 @@ def gene(result):
     if result["database"] == "ENSEMBL":
         return result["optional_id"]
 
+    if result["database"] == "MIRBASE":
+        return result["optional_id"]
+
     if result["rna_type"] == "piRNA" and result["database"] == "ENA":
         return result["product"]
 

diff --git a/rnacentral_pipeline/rnacentral/precompute/description/species_specific.py b/rnacentral_pipeline/rnacentral/precompute/description/species_specific.py
@@ -83,6 +83,7 @@
     Database.zwd,
     Database.noncode,
     Database.evlncrnas,
+    Database.mgnify,
 ]
 """
 A dict that defines the ordered choices for each type of RNA. This is the

diff --git a/rnacentral_pipeline/rnacentral/precompute/rna_type/so_term.py b/rnacentral_pipeline/rnacentral/precompute/rna_type/so_term.py
@@ -29,6 +29,11 @@
 
 LOGGER = logging.getLogger(__name__)
 
+EXCLUDED_DATABASES = {
+    Database.genecards,
+    Database.malacards,
+}
+
 ACCEPTED_DATABASES = {
     Database.five_srrnadb,
     Database.flybase,
@@ -274,8 +279,15 @@ def fn(source) -> bool:
 def all_annotations(
     context: context.Context, sequence: seq.Sequence
 ) -> ty.List[RnaTypeAnnotation]:
+    """
+    This builds a list of RNATypeAnnotations using the accessions, R2DT and
+    Rfam hits. However, this will exclude using annotations from all databases
+    in EXCLUDED_DATABASES.
+    """
     annotations = []
     for accession in sequence.accessions:
+        if accession.database in EXCLUDED_DATABASES:
+            continue
         annotations.append(RnaTypeAnnotation.from_accession(context, accession))
     for r2dt in sequence.r2dt_hits:
         if r2dt.paired_ratio() is None or r2dt.paired_ratio() > 0.80:
@@ -296,6 +308,8 @@ def rna_type_of(
 ) -> ty.Optional[RnaType]:
 
     annotations = all_annotations(context, sequence)
+    if len(annotations) == 0:
+        return RnaType.ncRNA()
     merged = merge_annotations(annotations)
 
     if len(merged) == 1:

diff --git a/rnacentral_pipeline/rnacentral/r2dt/should_show.py b/rnacentral_pipeline/rnacentral/r2dt/should_show.py
@@ -188,7 +188,7 @@ def write(model_path: Path, handle: ty.IO, db_url: str, output: ty.IO):
         to_write = pd.DataFrame()
         to_write["urs"] = frame["urs"]
         to_write["should_show"] = predicted.astype(int)
-        to_write.to_csv(output, index=False)
+        to_write.to_csv(output, index=False, header=False)
 
 
 def write_model(handle: ty.IO, db_url: str, output: Path):

diff --git a/rnacentral_pipeline/rnacentral/search_export/data.py b/rnacentral_pipeline/rnacentral/search_export/data.py
@@ -715,18 +715,26 @@ def has_editing_event(editing_events):
 
 def edit_chromosome(editing_events):
     ## They should all be on the same chromosome
+    if len(editing_events) == 0:
+        return []
     return editing_events[0]["chromosome"]
 
 
 def edit_locations(editing_events):
+    if len(editing_events) == 0:
+        return []
     return [edit["genomic_location"] for edit in editing_events]
 
 
 def edit_repeat_type(editing_events):
+    if len(editing_events) == 0:
+        return []
     return editing_events[0]["repeat_type"]
 
 
 def edit_ref_to_edit(editing_events):
+    if len(editing_events) == 0:
+        return []
     return [f"{edit['reference']}->{edit['edit']}" for edit in editing_events]
 
 
@@ -858,9 +866,21 @@ def edit_ref_to_edit(editing_events):
                 field("has_lit_scan", has_publications, keys="publication_count"),
                 field("has_litsumm", has_litsumm, keys="litsumm"),
                 field("has_editing_event", has_editing_event, keys="editing_events"),
-                field("edit_chromosome", edit_chromosome, keys="editing_events"),
-                field("edit_locations", edit_locations, keys="editing_events"),
-                field("edit_repeat_type", edit_repeat_type, keys="editing_events"),
+                field(
+                    "edit_chromosome",
+                    edit_chromosome,
+                    keys="editing_events",
+                ),
+                field(
+                    "edit_locations",
+                    edit_locations,
+                    keys="editing_events",
+                ),
+                field(
+                    "edit_repeat_type",
+                    edit_repeat_type,
+                    keys="editing_events",
+                ),
                 ## Add new fields above this line! Otherwise editing the produced xml is hard.
                 tree("so_rna_type", so_rna_type_tree, key="so_rna_type_tree"),
             ],

diff --git a/tests/rnacentral/ftp_export/id_mapping_test.py b/tests/rnacentral/ftp_export/id_mapping_test.py
@@ -122,6 +122,15 @@ def test_can_create_accession(data, expected):
             },
             "bob",
         ),
+        (
+            {
+                "gene": "gene1",
+                "database": "MIRBASE",
+                "optional_id": "hsa-mir-1",
+                "rna_type": "miRNA",
+            },
+            "hsa-mir-1",
+        ),
     ],
 )
 def test_can_generate_gene(data, expected):
@@ -223,6 +232,16 @@ def test_as_entry_works_correctly():
                 ],
             ],
         ),
+        # (
+        #     "URS000069C337_9606",
+        #     [
+        #         ["URS000069C337_9606", "ENSEMBL", "ENST00000401212", 9606, "pre-miRNA", "MIR298"],
+        #         ["URS000069C337_9606", "GENECARDS", "", 9606, "pre-miRNA", "MIR298"],
+        #         ["URS000069C337_9606", "MALACARDS", "", 9606, "pre-miRNA", "MIR298"],
+        #         ["URS000069C337_9606", "MIRBASE", "", 9606, "pre-miRNA", "MIR298"],
+        #         ["URS000069C337_9606", "REFSEQ", "", 9606, "pre-miRNA", "MIR298"],
+        #     ]
+        # ),
     ],
 )
 def test_can_create_expected_exports(rna_id, expected):

diff --git a/utils/expression-atlas/Cargo.toml b/utils/expression-atlas/Cargo.toml
@@ -12,6 +12,6 @@ log = "0.4"
 env_logger = "0.9.0"
 multimap = "0.8.3"
 clap = { version = "3.1.18", features = ["derive"] }
-polars = { version = "0.21.1", features = ["lazy", "csv-file", "rows", "abs", "is_in", "strings", "concat_str", "list", "json"] }
 quick-xml = { version = "0.22.0", features = ["serialize"] }
 serde = { version = "1.0", features = [ "derive" ] }
+polars = { version = "0.33.2", features = ["lazy", "rows", "abs", "is_in", "strings", "concat_str", "json", "streaming"] }