Skip to content

Commit

Permalink
Adds support for chunking of the analysis-runs EBI Search dump
Browse files Browse the repository at this point in the history
  • Loading branch information
SandyRogers committed Nov 15, 2023
1 parent 2827c2b commit d18573c
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 18 deletions.
39 changes: 23 additions & 16 deletions emgapi/management/commands/ebi_search_analysis_dump.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from typing import Optional

from django.core.management import BaseCommand
from django.core.paginator import Paginator
from django.db.models import QuerySet
from django.template.loader import render_to_string
from django.utils import timezone
Expand All @@ -45,30 +46,31 @@ def add_arguments(self, parser):
help="Create a full snapshot rather than incremental.",
)
parser.add_argument("-o", "--output", help="Output dir for xml files", required=True)
parser.add_argument("-c", "--chunk", help="Number of analyses per chunk", default=100, nargs='?', type=int)

def get_analysis_context(self, analysis: AnalysisJob):
try:
analysis_taxonomy: Optional[AnalysisJobTaxonomy] = AnalysisJobTaxonomy.objects.get(
analysis_id=str(analysis.job_id)
analysis_id=analysis.job_id
)
except AnalysisJobTaxonomy.DoesNotExist:
logger.warning(f"Could not find analysis job taxonomy for {analysis.job_id}")
logger.debug(f"Could not find analysis job taxonomy for {analysis.job_id}")
analysis_taxonomy = None

try:
go_annotation: Optional[AnalysisJobGoTerm] = AnalysisJobGoTerm.objects.get(
pk=str(analysis.job_id)
pk=analysis.job_id
)
except AnalysisJobGoTerm.DoesNotExist:
logger.warning(f"Could not find go terms for {analysis.job_id}")
logger.debug(f"Could not find go terms for {analysis.job_id}")
go_annotation = None

try:
ips_annotation: Optional[AnalysisJobInterproIdentifier] = AnalysisJobInterproIdentifier.objects.get(
pk=str(analysis.job_id)
pk=analysis.job_id
)
except AnalysisJobInterproIdentifier.DoesNotExist:
logger.warning(f"Could not find IPS terms for {analysis.job_id}")
logger.debug(f"Could not find IPS terms for {analysis.job_id}")
ips_annotation = None

biome_list = analysis.study.biome.lineage.split(":")[1:]
Expand Down Expand Up @@ -161,6 +163,7 @@ def handle(self, *args, **options):
"""Dump EBI Search XML file of analyses"""
is_full_snapshot: str = options["full"]
output_dir: str = options["output"]
chunk_size: int = options["chunk"]

pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

Expand All @@ -183,17 +186,21 @@ def handle(self, *args, **options):
)
)

additions_file = pathlib.Path(output_dir) / pathlib.Path('analyses.xml')
with open(additions_file, 'w') as a:
self.write_without_blank_lines(a,
render_to_string(
"ebi_search/analyses.xml",
{
"additions": (self.get_analysis_context(analysis) for analysis in analyses),
"count": analyses.count()
}
paginated_analyses = Paginator(analyses, chunk_size)

for page in paginated_analyses:
logger.info(f"Dumping {page.number = }/{paginated_analyses.num_pages}")
additions_file = pathlib.Path(output_dir) / pathlib.Path(f'analyses_{page.number:04}.xml')
with open(additions_file, 'w') as a:
self.write_without_blank_lines(a,
render_to_string(
"ebi_search/analyses.xml",
{
"additions": (self.get_analysis_context(analysis) for analysis in page),
"count": len(page)
}
)
)
)

nowish = timezone.now() + timedelta(minutes=1)
# Small buffer into the future so that the indexing time remains ahead of auto-now updated times.
Expand Down
2 changes: 1 addition & 1 deletion emgcli/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__: str = "2.4.37"
__version__: str = "2.4.38"
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ max-line-length = 119
"""

[tool.bumpversion]
current_version = "2.4.37"
current_version = "2.4.38"

[[tool.bumpversion.files]]
filename = "emgcli/__init__.py"

0 comments on commit d18573c

Please sign in to comment.