Skip to content

Commit

Permalink
adds ebi search dump for studies/projects (plus some tweaks to analyses)
Browse files Browse the repository at this point in the history
  • Loading branch information
SandyRogers committed Oct 24, 2023
1 parent 14f0842 commit 1a3acfd
Show file tree
Hide file tree
Showing 8 changed files with 199 additions and 3 deletions.
3 changes: 2 additions & 1 deletion emgapi/management/commands/ebi_search_analysis_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,8 @@ def handle(self, *args, **options):
render_to_string(
"ebi_search/analyses.xml",
{
"additions": (self.get_analysis_context(analysis) for analysis in analyses)
"additions": (self.get_analysis_context(analysis) for analysis in analyses),
"count": analyses.count()
}
)
)
Expand Down
108 changes: 108 additions & 0 deletions emgapi/management/commands/ebi_search_study_dump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright 2017-2023 EMBL - European Bioinformatics Institute
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import pathlib
from datetime import timedelta

from django.core.management import BaseCommand
from django.db.models import QuerySet
from django.template.loader import render_to_string
from django.utils import timezone

from emgapi.models import Study

logger = logging.getLogger(__name__)


class Command(BaseCommand):
    """Management command that writes EBI Search XML dumps of studies/projects.

    In incremental mode (the default) two files are written to the output
    directory: ``projects-deletes.xml`` (studies to remove from the index) and
    ``projects.xml`` (studies to add). With ``--full`` only ``projects.xml``
    is written, built from ``Study.objects.available(None)``.
    """

    help = "Generate the XML dump of studies for EBI Search."

    def add_arguments(self, parser):
        """Register the ``--full`` flag and the required ``-o/--output`` directory."""
        super().add_arguments(parser)
        parser.add_argument(
            "--full",
            action="store_true",
            help="Create a full snapshot rather than incremental.",
        )
        parser.add_argument("-o", "--output", help="Output dir for xml files", required=True)

    @staticmethod
    def write_without_blank_lines(fp, string):
        """Write ``string`` to ``fp`` with empty/whitespace-only lines dropped.

        The remaining lines are joined with ``"\\n"``; no trailing newline is
        written.
        """
        # str.strip returns "" (falsy) for blank lines, so filter() discards them.
        fp.write("\n".join(filter(str.strip, string.splitlines())))

    @staticmethod
    def get_study_context(study: Study):
        """Build the template context for a single study.

        Returns a dict with the study itself and ``biome_list``: the biome
        lineage split on ``":"`` with its first element dropped.
        """
        biome_list = study.biome.lineage.split(":")[1:]

        return {
            "study": study,
            "biome_list": biome_list,
        }

    def _render_to_file(self, path, template_name, context):
        """Render ``template_name`` with ``context`` and write it to ``path``."""
        # The templates contain non-ASCII text (e.g. an en dash), so the
        # encoding is pinned to UTF-8 instead of depending on the locale.
        with open(path, "w", encoding="utf-8") as fp:
            self.write_without_blank_lines(fp, render_to_string(template_name, context))

    def handle(self, *args, **options):
        """Dump EBI Search XML file of studies/projects.

        Writes the deletions file (incremental mode only) and the additions
        file, then stamps ``last_indexed`` on every study that was dumped.
        """
        is_full_snapshot: bool = options["full"]
        output_dir = pathlib.Path(options["output"])

        output_dir.mkdir(parents=True, exist_ok=True)

        studies: QuerySet = Study.objects.available(None)

        if not is_full_snapshot:
            studies = Study.objects_for_indexing.to_add()

            removals = Study.objects_for_indexing.to_delete()

            # produce incremental deletion file
            self._render_to_file(
                output_dir / "projects-deletes.xml",
                "ebi_search/projects-deletes.xml",
                {"removals": removals},
            )

        self._render_to_file(
            output_dir / "projects.xml",
            "ebi_search/projects.xml",
            {
                # Generator: study contexts are built lazily while the template renders.
                "additions": (self.get_study_context(study) for study in studies),
                "count": studies.count(),
            },
        )

        # Small buffer into the future so that the indexing time remains ahead of auto-now updated times.
        nowish = timezone.now() + timedelta(minutes=1)

        for study in studies:
            study.last_indexed = nowish

        Study.objects.bulk_update(studies, fields=["last_indexed"])
18 changes: 18 additions & 0 deletions emgapi/migrations/0013_study_last_indexed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 3.2.18 on 2023-10-23 16:41

from django.db import migrations, models


class Migration(migrations.Migration):
    """Add the nullable ``last_indexed`` DateTimeField to the ``study`` model."""

    dependencies = [("emgapi", "0012_auto_20231020_1525")]

    operations = [
        migrations.AddField(
            model_name="study",
            name="last_indexed",
            field=models.DateTimeField(
                blank=True,
                db_column="LAST_INDEXED",
                help_text="Date at which this model was last included in an EBI Search initial/incremental index.",
                null=True,
            ),
        ),
    ]
18 changes: 18 additions & 0 deletions emgapi/migrations/0014_alter_study_last_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Generated by Django 3.2.18 on 2023-10-23 17:16

from django.db import migrations, models


class Migration(migrations.Migration):
    """Alter ``study.last_update`` to a DateTimeField declared with ``auto_now=True``."""

    dependencies = [("emgapi", "0013_study_last_indexed")]

    operations = [
        migrations.AlterField(
            model_name="study",
            name="last_update",
            field=models.DateTimeField(
                auto_now=True,
                db_column="LAST_UPDATE",
            ),
        ),
    ]
4 changes: 2 additions & 2 deletions emgapi/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -893,7 +893,7 @@ def mydata(self, request):
return self.get_queryset().mydata(request)


class Study(ENASyncableModel):
class Study(ENASyncableModel, EbiSearchIndexedModel):

def __init__(self, *args, **kwargs):
super(Study, self).__init__(*args, **kwargs)
Expand Down Expand Up @@ -927,7 +927,7 @@ def _custom_pk(self):
author_name = models.CharField(
db_column='AUTHOR_NAME', max_length=100, blank=True, null=True)
last_update = models.DateTimeField(
db_column='LAST_UPDATE')
db_column='LAST_UPDATE', auto_now=True)
submission_account_id = models.CharField(
db_column='SUBMISSION_ACCOUNT_ID',
max_length=15, blank=True, null=True)
Expand Down
3 changes: 3 additions & 0 deletions emgapi/templates/ebi_search/analyses.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
<database xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://www.ebi.ac.uk/ebisearch/XML4dbDumps.xsd">
<name>EMG_run</name>
<description>EMG Analysis runs – samples analysed by MGnify pipelines</description>
<release>{% now "Y-m-d" %}</release>
<entry_count>{{ count }}</entry_count>
<entries>
{% for a in additions %}
{% include "ebi_search/analysis.xml" with analysis=a.analysis analysis_biome=a.analysis_biome analysis_taxonomies=a.analysis_taxonomies analysis_go_entries=a.analysis_go_entries analysis_ips_entries=a.analysis_ips_entries sample_metadata=a.sample_metadata only %}
Expand Down
8 changes: 8 additions & 0 deletions emgapi/templates/ebi_search/projects-deletes.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{# EBI Search deletions dump for the EMG_Project database: one empty <entry> per item in "removals". #}
<database xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://www.ebi.ac.uk/ebisearch/XML4dbDumps.xsd">
<name>EMG_Project</name>
<entries>
{# Only the accession is emitted; it becomes the id attribute of the entry. #}
{% for entry in removals %}
<entry id="{{ entry.accession }}" />
{% endfor %}
</entries>
</database>
40 changes: 40 additions & 0 deletions emgapi/templates/ebi_search/projects.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
{# EBI Search additions dump for the EMG_Project database: one <entry> per item in "additions"; "count" fills <entry_count>. #}
<database xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="https://www.ebi.ac.uk/ebisearch/XML4dbDumps.xsd">
<name>EMG_Project</name>
<description>EMG Projects – studies analysed by MGnify</description>
{# Release is the date on which the template is rendered. #}
<release>{% now "Y-m-d" %}</release>
<entry_count>{{ count }}</entry_count>
<entries>
{# Each addition exposes the study as "study" and its biome lineage parts as "biome_list". #}
{% for addition in additions %}
{% with addition.study as study %}
<entry xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" id="{{ study.accession }}">
{# NOTE(review): "safe" turns off autoescaping, so any &, < or > in these free-text fields is written raw; confirm the stored values are already XML-safe. #}
<name>{{ study.study_name | safe }}</name>
<description>{{ study.study_abstract | safe }}</description>
<dates>
<date type="creation_date" value="{{ study.first_created|date:'Y-m-d' }}"/>
<date type="last_modification_date" value="{{ study.last_update|date:'Y-m-d' }}"/>
</dates>
<additional_fields>
<field name="secondary_acc">{{ study.secondary_accession }}</field>
<field name="biome_name">{{ study.biome.biome_name }}</field>
{# The first biome_list element is written as <root>; every later element as a <child>. #}
<hierarchical_field name="biome">
{% for biome_element in addition.biome_list %}
{% if forloop.first %}
<root>{{ biome_element | safe }}</root>
{% else %}
<child>{{ biome_element | safe }}</child>
{% endif %}
{% endfor %}
</hierarchical_field>
<field name="centre_name">{{ study.centre_name | safe }}</field>
</additional_fields>
{# One reference to the ENA project, then one per analysis keyed as "<accession>_<pipeline release version>". #}
<cross_references>
<ref dbkey="{{ study.project_id }}" dbname="ena_project"/>
{% for analysis in study.analyses.all %}
<ref dbkey="{{ analysis.accession }}_{{ analysis.pipeline.release_version }}" dbname="metagenomics_analyses"/>
{% endfor %}
</cross_references>
</entry>
{% endwith %}
{% endfor %}
</entries>
</database>

0 comments on commit 1a3acfd

Please sign in to comment.