Skip to content

Commit

Permalink
Merge pull request #138 from CanDIG/daisieh/cohort-completeness
Browse files (browse the repository at this point in the history)
DIG-1812: move calculation of genomic completeness stats to htsget_ingest from query
  • Loading branch information
daisieh authored Nov 14, 2024
2 parents a3391bd + db4904b commit 6dafbd0
Showing 1 changed file with 32 additions and 0 deletions.
32 changes: 32 additions & 0 deletions htsget_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,10 +224,12 @@ def htsget_ingest(ingest_json, do_not_index=False):
"errors": {},
"results": {}
}
program_ids = set()
to_index = []
status_code = 200
for sample in ingest_json:
logger.debug(f"Ingesting {sample['genomic_file_id']}, do_not_index = {do_not_index}")
program_ids.add(sample["program_id"])
result["errors"][sample["genomic_file_id"]] = []
# create the corresponding DRS objects
if "samples" not in sample or len(sample["samples"]) == 0:
Expand Down Expand Up @@ -258,6 +260,36 @@ def htsget_ingest(ingest_json, do_not_index=False):
for url in to_index:
response = requests.get(url, headers=headers, params={"do_not_index": do_not_index})

# Recompute completeness stats (samples with genomes, with transcriptomes, and with both) for every program_id named in this ingest request
statistics = {}
for program_id in program_ids:
url = f"{HTSGET_URL}/htsget/v1/samples"
response = requests.get(url, headers=headers, params={"cohort": program_id})
if response.status_code == 200:
for sample in response.json():
if program_id not in statistics:
statistics[program_id] = { 'genomes': 0, 'transcriptomes': 0, 'all': 0 }
if len(sample['genomes']) > 0 and len(sample['transcriptomes']) > 0:
statistics[program_id]['all'] += 1
if len(sample['genomes']) > 0:
statistics[program_id]['genomes'] += 1
if len(sample['transcriptomes']) > 0:
statistics[program_id]['transcriptomes'] += 1
else:
result["errors"] = f"Could not collect completeness stats for program: {response.text}"

for program_id in statistics:
# fetch the existing cohort object so the computed statistics can be attached to it and posted back
url = f"{HTSGET_URL}/ga4gh/drs/v1/cohorts"
response = requests.get(f"{url}/{program_id}", headers=headers)
if response.status_code == 200:
cohort = response.json()
cohort["statistics"] = statistics[program_id]
response = requests.post(url, headers=headers, json=cohort)
if response.status_code != 200:
result["errors"] = f"Could not add statistics for program: {response.text}"
else:
result["errors"] = f"Could not add statistics for program: {response.text}"
return result, status_code


Expand Down

0 comments on commit 6dafbd0

Please sign in to comment.