diff --git a/cg_lims/EPPs/udf/calculate/base.py b/cg_lims/EPPs/udf/calculate/base.py index d935d162..0b037f73 100644 --- a/cg_lims/EPPs/udf/calculate/base.py +++ b/cg_lims/EPPs/udf/calculate/base.py @@ -25,6 +25,7 @@ from cg_lims.EPPs.udf.calculate.ont_aliquot_volume import ont_aliquot_volume from cg_lims.EPPs.udf.calculate.ont_sequencing_reload import ont_available_sequencing_reload from cg_lims.EPPs.udf.calculate.pool_normalization import pool_normalization +from cg_lims.EPPs.udf.calculate.qpcr_concentration import qpcr_concentration from cg_lims.EPPs.udf.calculate.sum_missing_reads_in_pool import missing_reads_in_pool from cg_lims.EPPs.udf.calculate.twist_aliquot_amount import twist_aliquot_amount from cg_lims.EPPs.udf.calculate.twist_get_volumes_from_buffer import get_volumes_from_buffer @@ -61,6 +62,7 @@ def calculate(ctx): calculate.add_command(novaseq_x_volumes) calculate.add_command(pool_normalization) calculate.add_command(novaseq_x_denaturation) +calculate.add_command(qpcr_concentration) calculate.add_command(calculate_saphyr_concentration) calculate.add_command(ont_aliquot_volume) calculate.add_command(ont_available_sequencing_reload) diff --git a/cg_lims/EPPs/udf/calculate/constants.py b/cg_lims/EPPs/udf/calculate/constants.py index 13c85b3e..9a3f7d2a 100644 --- a/cg_lims/EPPs/udf/calculate/constants.py +++ b/cg_lims/EPPs/udf/calculate/constants.py @@ -44,5 +44,392 @@ class FlowCellLaneVolumes25B(FloatEnum): BUFFER_VOLUME: float = 210 +WELL_TRANSFORMER = { + "A01": "A:1", + "A02": "A:1", + "A03": "A:2", + "A04": "A:2", + "A05": "A:3", + "A06": "A:3", + "A07": "A:4", + "A08": "A:4", + "A09": "A:5", + "A10": "A:5", + "A11": "A:6", + "A12": "A:6", + "A13": "A:7", + "A14": "A:7", + "A15": "A:8", + "A16": "A:8", + "A17": "A:9", + "A18": "A:9", + "A19": "A:10", + "A20": "A:10", + "A21": "A:11", + "A22": "A:11", + "A23": "A:12", + "A24": "A:12", + "B01": "A:1", + "B02": "Std:1", + "B03": "A:2", + "B04": "Std:1", + "B05": "A:3", + "B06": "Std:1", + "B07": "A:4", + "B08": "", + "B09": "A:5", + "B10": "", + "B11": "A:6", + "B12": "", + "B13": "A:7", + "B14": "", + "B15": "A:8", + "B16": "", + "B17": "A:9", + "B18": "", + "B19": "A:10", + "B20": "", + "B21": "A:11", + "B22": "", + "B23": "A:12", + "B24": "", + "C01": "B:1", + "C02": "B:1", + "C03": "B:2", + "C04": "B:2", + "C05": "B:3", + "C06": "B:3", + "C07": "B:4", + "C08": "B:4", + "C09": "B:5", + "C10": "B:5", + "C11": "B:6", + "C12": "B:6", + "C13": "B:7", + "C14": "B:7", + "C15": "B:8", + "C16": "B:8", + "C17": "B:9", + "C18": "B:9", + "C19": "B:10", + "C20": "B:10", + "C21": "B:11", + "C22": "B:11", + "C23": "B:12", + "C24": "B:12", + "D01": "B:1", + "D02": "Std:2", + "D03": "B:2", + "D04": "Std:2", + "D05": "B:3", + "D06": "Std:2", + "D07": "B:4", + "D08": "", + "D09": "B:5", + "D10": "", + "D11": "B:6", + "D12": "", + "D13": "B:7", + "D14": "", + "D15": "B:8", + "D16": "", + "D17": "B:9", + "D18": "", + "D19": "B:10", + "D20": "", + "D21": "B:11", + "D22": "", + "D23": "B:12", + "D24": "", + "E01": "C:1", + "E02": "C:1", + "E03": "C:2", + "E04": "C:2", + "E05": "C:3", + "E06": "C:3", + "E07": "C:4", + "E08": "C:4", + "E09": "C:5", + "E10": "C:5", + "E11": "C:6", + "E12": "C:6", + "E13": "C:7", + "E14": "C:7", + "E15": "C:8", + "E16": "C:8", + "E17": "C:9", + "E18": "C:9", + "E19": "C:10", + "E20": "C:10", + "E21": "C:11", + "E22": "C:11", + "E23": "C:12", + "E24": "C:12", + "F01": "C:1", + "F02": "Std:3", + "F03": "C:2", + "F04": "Std:3", + "F05": "C:3", + "F06": "Std:3", + "F07": "C:4", + "F08": "", + "F09": "C:5", + "F10": "", + "F11": "C:6", + "F12": "", + "F13": "C:7", + "F14": "", + "F15": "C:8", + "F16": "", + "F17": "C:9", + "F18": "", + "F19": "C:10", + "F20": "", + "F21": "C:11", + "F22": "", + "F23": "C:12", + "F24": "", + "G01": "D:1", + "G02": "D:1", + "G03": "D:2", + "G04": "D:2", + "G05": "D:3", + "G06": "D:3", + "G07": "D:4", + "G08": "D:4", + "G09": "D:5", + "G10": "D:5", + "G11": "D:6", + "G12": "D:6", + "G13": "D:7", + "G14": "D:7", + "G15": "D:8", + "G16": "D:8", + "G17": "D:9", + "G18": "D:9", + "G19": "D:10", + "G20": "D:10", + "G21": "D:11", + "G22": "D:11", + "G23": "D:12", + "G24": "D:12", + "H01": "D:1", + "H02": "Std:4", + "H03": "D:2", + "H04": "Std:4", + "H05": "D:3", + "H06": "Std:4", + "H07": "D:4", + "H08": "", + "H09": "D:5", + "H10": "", + "H11": "D:6", + "H12": "", + "H13": "D:7", + "H14": "", + "H15": "D:8", + "H16": "", + "H17": "D:9", + "H18": "", + "H19": "D:10", + "H20": "", + "H21": "D:11", + "H22": "", + "H23": "D:12", + "H24": "", + "I01": "E:1", + "I02": "E:1", + "I03": "E:2", + "I04": "E:2", + "I05": "E:3", + "I06": "E:3", + "I07": "E:4", + "I08": "E:4", + "I09": "E:5", + "I10": "E:5", + "I11": "E:6", + "I12": "E:6", + "I13": "E:7", + "I14": "E:7", + "I15": "E:8", + "I16": "E:8", + "I17": "E:9", + "I18": "E:9", + "I19": "E:10", + "I20": "E:10", + "I21": "E:11", + "I22": "E:11", + "I23": "E:12", + "I24": "E:12", + "J01": "E:1", + "J02": "Std5", + "J03": "E:2", + "J04": "Std5", + "J05": "E:3", + "J06": "Std5", + "J07": "E:4", + "J08": "", + "J09": "E:5", + "J10": "", + "J11": "E:6", + "J12": "", + "J13": "E:7", + "J14": "", + "J15": "E:8", + "J16": "", + "J17": "E:9", + "J18": "", + "J19": "E:10", + "J20": "", + "J21": "E:11", + "J22": "", + "J23": "E:12", + "J24": "", + "K01": "F:1", + "K02": "F:1", + "K03": "F:2", + "K04": "F:2", + "K05": "F:3", + "K06": "F:3", + "K07": "F:4", + "K08": "F:4", + "K09": "F:5", + "K10": "F:5", + "K11": "F:6", + "K12": "F:6", + "K13": "F:7", + "K14": "F:7", + "K15": "F:8", + "K16": "F:8", + "K17": "F:9", + "K18": "F:9", + "K19": "F:10", + "K20": "F:10", + "K21": "F:11", + "K22": "F:11", + "K23": "F:12", + "K24": "F:12", + "L01": "F:1", + "L02": "Std6", + "L03": "F:2", + "L04": "Std6", + "L05": "F:3", + "L06": "Std6", + "L07": "F:4", + "L08": "", + "L09": "F:5", + "L10": "", + "L11": "F:6", + "L12": "", + "L13": "F:7", + "L14": "", + "L15": "F:8", + "L16": "", + "L17": "F:9", + "L18": "", + "L19": "F:10", + "L20": "", + "L21": "F:11", + "L22": "", + "L23": "F:12", + "L24": "", + "M01": "G:1", + "M02": "G:1", + "M03": "G:2", + "M04": "G:2", + "M05": "G:3", + "M06": "G:3", + "M07": "G:4", + "M08": "G:4", + "M09": "G:5", + "M10": "G:5", + "M11": "G:6", + "M12": "G:6", + "M13": "G:7", + "M14": "G:7", + "M15": "G:8", + "M16": "G:8", + "M17": "G:9", + "M18": "G:9", + "M19": "G:10", + "M20": "G:10", + "M21": "G:11", + "M22": "G:11", + "M23": "G:12", + "M24": "G:1", + "N01": "G:1", + "N02": "NTC", + "N03": "G:2", + "N04": "NTC", + "N05": "G:3", + "N06": "NTC", + "N07": "G:4", + "N08": "", + "N09": "G:5", + "N10": "", + "N11": "G:6", + "N12": "", + "N13": "G:7", + "N14": "", + "N15": "G:8", + "N16": "", + "N17": "G:9", + "N18": "", + "N19": "G:10", + "N20": "", + "N21": "G:11", + "N22": "", + "N23": "G:12", + "N24": "", + "O01": "H:1", + "O02": "H:1", + "O03": "H:2", + "O04": "H:2", + "O05": "H:3", + "O06": "H:3", + "O07": "H:4", + "O08": "H:4", + "O09": "H:5", + "O10": "H:5", + "O11": "H:6", + "O12": "H:6", + "O13": "H:7", + "O14": "H:7", + "O15": "H:8", + "O16": "H:8", + "O17": "H:9", + "O18": "H:9", + "O19": "H:10", + "O20": "H:10", + "O21": "H:11", + "O22": "H:11", + "O23": "H:12", + "O24": "H:12", + "P01": "H:1", + "P02": "", + "P03": "H:2", + "P04": "", + "P05": "H:3", + "P06": "", + "P07": "H:4", + "P08": "", + "P09": "H:5", + "P10": "", + "P11": "H:6", + "P12": "", + "P13": "H:7", + "P14": "", + "P15": "H:8", + "P16": "", + "P17": "H:9", + "P18": "", + "P19": "H:10", + "P20": "", + "P21": "H:11", + "P22": "", + "P23": "H:12", + "P24": "", +} + AVERAGE_MOLECULAR_WEIGHT_DS_DNA = 615.96 AVERAGE_MOLECULAR_WEIGHT_DS_DNA_ENDS = 36.04 diff --git a/cg_lims/EPPs/udf/calculate/qpcr_concentration.py b/cg_lims/EPPs/udf/calculate/qpcr_concentration.py new file mode 100644 index 00000000..b0c8b70d --- /dev/null +++ b/cg_lims/EPPs/udf/calculate/qpcr_concentration.py @@ -0,0 +1,177 @@ +import logging +import sys +from pathlib import Path +from statistics import mean +from typing import Dict, List, Optional + +import click +import numpy as np +import pandas as pd +from cg_lims import options +from cg_lims.EPPs.udf.calculate.constants import WELL_TRANSFORMER +from cg_lims.exceptions import FailingQCError, LimsError, MissingFileError, MissingValueError +from cg_lims.get.artifacts import get_artifact_by_name, get_artifacts +from cg_lims.get.files import get_file_path +from genologics.entities import Artifact, Process + +LOG = logging.getLogger(__name__) + + +class WellValues: + def __init__(self, well): + self.well: str = well + self.sq: List[float] = [] + self.cq: List[float] = [] + self.viable_indexes: List[List[int]] = [] + self.artifact: Optional[Artifact] = None + + def add_values(self, sq_value: float, cq_value: float) -> None: + """Add Cq and SQ values to the well object.""" + self.sq.append(sq_value) + self.cq.append(cq_value) + + def connect_artifact(self, artifact: Artifact) -> None: + """Connect an artifact to the well object.""" + self.artifact: Artifact = artifact + + def _trim_outliers(self) -> str: + """Remove the largest outlier of the current Cq and SQ values.""" + outlier_index: int = get_index_of_biggest_outlier(values=self.cq) + removed_cq_value: float = self.cq.pop(outlier_index) + removed_sq_value: float = self.sq.pop(outlier_index) + return f"Removed outlier Cq value {removed_cq_value} and SQ value {removed_sq_value} from {self.artifact.samples[0].id}." + + def get_concentration(self, cq_threshold: float, size_bp: int) -> float: + """Return the concentration (M) of the well object, given a Cq difference threshold and fragment size.""" + cq_set_difference: float = get_max_difference(self.cq) + if cq_set_difference > cq_threshold: + message: str = ( + f" Cq value difference is too high between the replicates of sample {self.artifact.samples[0].id}.\n" + f"Difference: {cq_set_difference}\n" + f"Cq values: {self.cq}\n" + f"SQ values: {self.sq}\n" + ) + trim_log: str = self._trim_outliers() + message = message + trim_log + LOG.info(message) + return calculate_molar_concentration(sq_values=self.sq, size_bp=size_bp) + + def set_artifact_udfs( + self, concentration_threshold: float, replicate_threshold: float, size_bp: int + ) -> None: + """Calculate and set all UDFs and QC flags of the connected artifact.""" + molar_concentration: float = self.get_concentration( + cq_threshold=float(replicate_threshold), size_bp=int(size_bp) + ) + self.artifact.qc_flag = "PASSED" + self.artifact.udf["Size (bp)"] = int(size_bp) + self.artifact.udf["Concentration"] = molar_concentration + self.artifact.udf["Concentration (nM)"] = molar_concentration * 10**9 + if self.artifact.udf["Concentration (nM)"] < concentration_threshold: + self.artifact.qc_flag = "FAILED" + LOG.info( + f" Concentration of {self.artifact.samples[0].id} is {molar_concentration * 10**9} nM." + f" Setting QC flag to failed as this is below the threshold of {concentration_threshold} nM." + ) + self.artifact.put() + + +def parse_quantification_summary(summary_file: str) -> Dict[str, WellValues]: + """Parse Quantification Summary excel file and return python dict with + original wells as the keys and WellValues objects as the values.""" + df: pd.DataFrame = pd.read_excel(summary_file) + quantification_data: Dict = {} + for index, row in df.iterrows(): + well: str = row["Well"] + cq: float = round(row["Cq"], 3) + sq: float = row["SQ"] + if not (np.isnan(cq) or np.isnan(sq)): + orig_well: str = WELL_TRANSFORMER[well] + if orig_well not in quantification_data.keys(): + quantification_data[orig_well] = WellValues(well=orig_well) + quantification_data[orig_well].add_values(sq_value=sq, cq_value=cq) + return quantification_data + + +def calculate_molar_concentration(sq_values: List[float], size_bp: int): + """Calculate and return the molar concentration given a list of SQ values and a fragment size.""" + return (mean(sq_values) * 10**4) * (452 / size_bp) + + +def get_index_of_biggest_outlier(values: List[float]) -> int: + """Return the index of the largest outlier in the given list of values.""" + mean_value: float = mean(values) + dev_from_mean: List[float] = [abs(value - mean_value) for value in values] + max_dev: float = max(dev_from_mean) + return dev_from_mean.index(max_dev) + + +def get_max_difference(values: List[float]) -> float: + """Return the difference between the largest and smallest values in a given list of values.""" + return max(values) - min(values) + + +@click.command() +@options.file_placeholder(help="qPCR result file placeholder name.") +@options.local_file() +@options.replicate_threshold() +@options.concentration_threshold() +@options.size_bp() +@click.pass_context +def qpcr_concentration( + ctx, + file: str, + local_file: str, + replicate_threshold: str, + concentration_threshold: str, + size_bp: str, +) -> None: + """Script for calculating qPCR dilutions. Requires an input qPCR result file and produces an output log file.""" + + LOG.info(f"Running {ctx.command_path} with params: {ctx.params}") + process: Process = ctx.obj["process"] + + if local_file: + file_path: str = local_file + else: + file_art: Artifact = get_artifact_by_name(process=process, name=file) + file_path: str = get_file_path(file_art) + + if not Path(file_path).is_file(): + raise MissingFileError(f"No such file: {file_path}") + + try: + artifacts: List[Artifact] = get_artifacts(process=process, measurement=True) + quantification_data: Dict[str, WellValues] = parse_quantification_summary( + summary_file=file_path + ) + failed_samples: int = 0 + for artifact in artifacts: + artifact_well: str = artifact.location[1] + if artifact_well not in quantification_data.keys(): + raise MissingValueError( + f"No values found for well {artifact_well} in the result file! " + f"Please double check if sample {artifact.samples[0].id} has been placed correctly." + ) + well_results: WellValues = quantification_data[artifact.location[1]] + well_results.connect_artifact(artifact=artifact) + well_results.set_artifact_udfs( + concentration_threshold=float(concentration_threshold), + replicate_threshold=float(replicate_threshold), + size_bp=int(size_bp), + ) + if well_results.artifact.qc_flag == "FAILED": + failed_samples += 1 + + if failed_samples: + error_message: str = ( + f" {failed_samples} sample(s) failed the QC! See the logs for further information." + ) + raise FailingQCError(error_message) + + message: str = " Concentrations have been calculated and set for all samples!" + LOG.info(message) + click.echo(message) + except LimsError as e: + LOG.error(e.message) + sys.exit(e.message) diff --git a/cg_lims/options.py b/cg_lims/options.py index 6ba4b854..c1d9167e 100644 --- a/cg_lims/options.py +++ b/cg_lims/options.py @@ -383,6 +383,18 @@ def udf_values( ) +def replicate_threshold(help: str = "Threshold for replicate difference.") -> click.option: + return click.option("--replicate-threshold", required=True, help=help) + + +def concentration_threshold(help: str = "Threshold for concentrations.") -> click.option: + return click.option("--concentration-threshold", required=True, help=help) + + +def size_bp(help: str = "Fragment size in base pairs.") -> click.option: + return click.option("--size-bp", required=True, help=help) + + def root_path( help: str = "Root path to be used by the script to find files.", ) -> click.option: @@ -433,3 +445,4 @@ def well_udf(help: str = "UDF name for artifact well.") -> click.option: def container_name_udf(help: str = "UDF name for container name.") -> click.option: return click.option("--container-name-udf", required=False, default=None, help=help) + diff --git a/requirements-dev.txt b/requirements-dev.txt index 79600ab0..ab2300b3 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,3 +7,4 @@ mock coloredlogs responses limsmock +openpyxl diff --git a/requirements.txt b/requirements.txt index 91ef5ebf..a5389226 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ uvicorn matplotlib pymongo coloredlogs +openpyxl diff --git a/tests/EPPs/udf/calculate/test_qpcr_concentration.py b/tests/EPPs/udf/calculate/test_qpcr_concentration.py new file mode 100644 index 00000000..5fecd002 --- /dev/null +++ b/tests/EPPs/udf/calculate/test_qpcr_concentration.py @@ -0,0 +1,76 @@ +from pathlib import Path +from typing import List + +import click +import pytest +from cg_lims.EPPs.udf.calculate.qpcr_concentration import ( + WellValues, + calculate_molar_concentration, + get_index_of_biggest_outlier, + get_max_difference, + parse_quantification_summary, + qpcr_concentration, +) +from cg_lims.exceptions import FileError, MissingFileError +from genologics.entities import Artifact, Process +from tests.conftest import server + + +@pytest.mark.parametrize( + "sq_values, size_bp, concentration", + [ + ([1.10e-12, 1.44e-12, 1.09e-12], 470, 1.1637e-08), + ([1.41e-12, 1.37e-12, 1.51e-12], 470, 1.3752e-08), + ([1.37e-12, 1.48e-12, 1.54e-12], 470, 1.4073e-08), + ([1.34e-12, 1.20e-12, 1.30e-12], 470, 1.231e-08), + ], +) +def test_calculate_molar_concentration(sq_values: List[float], size_bp: int, concentration: float): + """Test to verify that molar concentrations are accurately calculated given a set if different SQ values.""" + # GIVEN a set of SQ values of differing values + + # WHEN calculating the molar concentration with calculate_molar_concentration + result = calculate_molar_concentration(sq_values=sq_values, size_bp=size_bp) + + # THEN the correct value is returned + assert round(result, 12) == concentration + + +@pytest.mark.parametrize( + "values, biggest_index", + [ + ([1.10e-12, 1.44e-12, 1.09e-12], 1), + ([10, 7, 6, 8, 5], 0), + ([0, 100, -100, 123, 454], 4), + ([-18, -12, -10], 0), + ], +) +def test_get_index_of_biggest_outlier(values: List[float], biggest_index: int): + """Test to verify that that the biggest outlier can be identified by the function get_index_of_biggest_outlier""" + # GIVEN a list of values with differing lengths + + # WHEN running get_index_of_biggest_outlier + result = get_index_of_biggest_outlier(values=values) + + # THEN the index of the largest entry is returned + assert result == biggest_index + + +@pytest.mark.parametrize( + "values, difference", + [ + ([1.0e-2, 1.1e-2, 1.2e-2], 0.002), + ([10, 7, 6, 8, 5], 5), + ([0, 100, -100, 123, 454], 554), + ([-18, -12, -10], 8), + ], +) +def test_get_max_difference(values: List[float], difference: float): + """Test to verify that the get_max_difference function properly works.""" + # GIVEN a list of values with differing lengths + + # WHEN running get_index_of_biggest_outlier + result = get_max_difference(values=values) + + # THEN the index of the largest entry is returned + assert result == difference diff --git a/tests/conftest.py b/tests/conftest.py index f3540332..b3c6b135 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,7 +7,6 @@ import pytest from cg_lims.EPPs.qc.sequencing_artifact_manager import SequencingArtifactManager from cg_lims.EPPs.qc.sequencing_quality_checker import SequencingQualityChecker -from cg_lims.models.sample_lane_sequencing_metrics import SampleLaneSequencingMetrics from cg_lims.status_db_api import StatusDBAPI from cg_lims.token_manager import TokenManager from click.testing import CliRunner @@ -354,7 +353,7 @@ def novaseq_metrics_missing_for_sample_in_lane( missing_sample_id, missing_lane, ) -> List[Dict]: - metrics: List[SampleLaneSequencingMetrics] = generate_metrics_json( + metrics: List[Dict] = generate_metrics_json( flow_cell_name=novaseq_flow_cell_name, sample_ids=novaseq_sample_ids, lanes=novaseq_lanes, @@ -379,7 +378,7 @@ def novaseq_missing_sample( ) -> List[Dict]: novaseq_sample_ids.append(sample_id_missing_in_lims) - metrics: List[SampleLaneSequencingMetrics] = generate_metrics_json( + metrics: List[Dict] = generate_metrics_json( flow_cell_name=novaseq_flow_cell_name, sample_ids=novaseq_sample_ids, lanes=novaseq_lanes,