From 3af5da96d1dc68ba056e015c95842ce27d0eea5a Mon Sep 17 00:00:00 2001 From: Daniel Cameron Date: Tue, 20 Feb 2018 17:22:15 +1100 Subject: [PATCH] Metrics no longer counting duplicate reads by default Fixes bug with low RP qual scores for libraries with high duplication rates. --- pom.xml | 2 +- ...FastEmpiricalReferenceLikelihoodModel.java | 2 +- src/main/java/gridss/ComputeCoverage.java | 1 + src/main/java/gridss/ExtractSVReads.java | 1 + .../gridss/analysis/CollectCigarMetrics.java | 4 +++ .../analysis/CollectFragmentGCMetrics.java | 7 ++--- .../gridss/analysis/CollectIdsvMetrics.java | 29 ++++--------------- .../gridss/analysis/CollectMapqMetrics.java | 28 +++--------------- .../CollectStructuralVariantReadMetrics.java | 8 +++++ .../gridss/analysis/CollectTagMetrics.java | 5 ++++ .../cmdline/GcSinglePassSamProgram.java | 4 +-- ...ucturalVariantReadsCommandLineProgram.java | 2 ++ 12 files changed, 36 insertions(+), 57 deletions(-) diff --git a/pom.xml b/pom.xml index 3cf738669..80ab82d68 100644 --- a/pom.xml +++ b/pom.xml @@ -4,7 +4,7 @@ au.edu.wehi gridss jar - 1.5.0 + 1.5.1 gridss https://github.com/PapenfussLab/gridss diff --git a/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java b/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java index e4aa414d4..81234c7ce 100644 --- a/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java +++ b/src/main/java/au/edu/wehi/idsv/model/FastEmpiricalReferenceLikelihoodModel.java @@ -47,7 +47,7 @@ public double scoreReadPair(IdsvSamFileMetrics metrics, int fragmentSize, int ma public double scoreUnmappedMate(IdsvSamFileMetrics metrics, int mapq) { IdsvMetrics im = metrics.getIdsvMetrics(); // completely unmapped read pairs are excluded for consistency with sc and dp calculation - double score = MathUtil.prToPhred((double)im.READ_PAIRS_ONE_MAPPED / (double)(im.READ_PAIRS - im.READ_PAIRS_ZERO_MAPPED)); + double score = MathUtil.prToPhred((double)im.READ_PAIRS_ONE_MAPPED / (double)(im.MAPPED_READS)); score = Math.min(score, mapq); return score; } diff --git a/src/main/java/gridss/ComputeCoverage.java b/src/main/java/gridss/ComputeCoverage.java index ee13c9ee9..000267c84 100644 --- a/src/main/java/gridss/ComputeCoverage.java +++ b/src/main/java/gridss/ComputeCoverage.java @@ -93,6 +93,7 @@ private IntervalCoverageAccumulator initIntervalCoverageAccumulator() { } @Override protected void acceptRead(SAMRecord record, ReferenceSequence refSeq) { + if (record.getDuplicateReadFlag() && !INCLUDE_DUPLICATES) return; ReadGcSummary gc = new ReadGcSummary(record, refSeq, UNPAIRED_FRAGMENT_SIZE, getReadPairConcordanceCalculator()); if (ica_gc != null) { ica_gc.add(record, gc, gcAdjust.adjustmentMultiplier((int)gc.gcPercentage)); diff --git a/src/main/java/gridss/ExtractSVReads.java b/src/main/java/gridss/ExtractSVReads.java index 770172a17..9d6eb19a8 100644 --- a/src/main/java/gridss/ExtractSVReads.java +++ b/src/main/java/gridss/ExtractSVReads.java @@ -191,6 +191,7 @@ public boolean[] shouldExtract(List records, ReferenceLookup lookup) extract[i] = !hasConsistentReadAlignment[SAMRecordUtil.getSegmentIndex(r)] && !readfilter.filterOut(r); // supp records should use the primary alignment when considering concordance extract[i] |= !hasConsistentReadPair && !pairfilter.filterOut(primaryAlignmentForSupplementary(r)); + extract[i] &= (!r.getDuplicateReadFlag() || INCLUDE_DUPLICATES); } return extract; } diff --git a/src/main/java/gridss/analysis/CollectCigarMetrics.java b/src/main/java/gridss/analysis/CollectCigarMetrics.java index 8ed573cf3..7f642361c 100644 --- a/src/main/java/gridss/analysis/CollectCigarMetrics.java +++ b/src/main/java/gridss/analysis/CollectCigarMetrics.java @@ -57,6 +57,9 @@ public class CollectCigarMetrics extends SinglePassSamProgram { @Argument(shortName="Z", doc="If set to true include a zero length operator for each operator not included in the alignment CIGAR.") public boolean INCLUDE_OMITTED_OPERATORS = true; + @Argument(doc="If true, also include reads marked as duplicates.") + public boolean INCLUDE_DUPLICATES = false; + private EnumMap> cigar; /** Required main method. */ @@ -79,6 +82,7 @@ protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) { // Skip unwanted records if (rec.getReadUnmappedFlag()) return; if (rec.getCigar() == null) return; + if (rec.getDuplicateReadFlag() && !INCLUDE_DUPLICATES) return; List list = rec.getCigar().getCigarElements(); if (list == null || list.size() == 0) return; for (CigarElement ce : list) { diff --git a/src/main/java/gridss/analysis/CollectFragmentGCMetrics.java b/src/main/java/gridss/analysis/CollectFragmentGCMetrics.java index e6ff0207e..2f3e13ade 100644 --- a/src/main/java/gridss/analysis/CollectFragmentGCMetrics.java +++ b/src/main/java/gridss/analysis/CollectFragmentGCMetrics.java @@ -78,11 +78,8 @@ public class CollectFragmentGCMetrics extends GcSinglePassSamProgram { } @Override protected void acceptRead(final SAMRecord record, final ReferenceSequence ref) { - if (record.getDuplicateReadFlag() && IGNORE_DUPLICATES) { - // ignore duplicates - } else { - multiCollector.acceptRecord(record, ref); - } + if (record.getDuplicateReadFlag() && !INCLUDE_DUPLICATES) return; + multiCollector.acceptRecord(record, ref); } @Override protected void finish() { diff --git a/src/main/java/gridss/analysis/CollectIdsvMetrics.java b/src/main/java/gridss/analysis/CollectIdsvMetrics.java index ef489153b..14d619ba1 100644 --- a/src/main/java/gridss/analysis/CollectIdsvMetrics.java +++ b/src/main/java/gridss/analysis/CollectIdsvMetrics.java @@ -1,31 +1,8 @@ -/* - * The MIT License - * - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - package gridss.analysis; import java.io.File; +import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import au.edu.wehi.idsv.sam.SAMRecordUtil; @@ -48,6 +25,9 @@ public class CollectIdsvMetrics extends SinglePassSamProgram { public static final String METRICS_SUFFIX = ".idsv_metrics"; + @Argument(doc="If true, also include reads marked as duplicates.") + public boolean INCLUDE_DUPLICATES = false; + private IdsvMetrics idsv; /** Required main method. */ @@ -63,6 +43,7 @@ public void setup(final SAMFileHeader header, final File samFile) { @Override public void acceptRead(final SAMRecord record, final ReferenceSequence ref) { + if (record.getDuplicateReadFlag() && !INCLUDE_DUPLICATES) return; idsv.MAX_READ_LENGTH = Math.max(idsv.MAX_READ_LENGTH, record.getReadLength()); if (!record.getReadUnmappedFlag()) { idsv.MAX_READ_MAPPED_LENGTH = Math.max(idsv.MAX_READ_MAPPED_LENGTH, record.getAlignmentEnd() - record.getAlignmentStart() + 1); diff --git a/src/main/java/gridss/analysis/CollectMapqMetrics.java b/src/main/java/gridss/analysis/CollectMapqMetrics.java index 2a7a18a58..8cfbb1087 100644 --- a/src/main/java/gridss/analysis/CollectMapqMetrics.java +++ b/src/main/java/gridss/analysis/CollectMapqMetrics.java @@ -1,27 +1,3 @@ -/* - * The MIT License - * - * Copyright (c) 2016 Daniel Cameron - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - package gridss.analysis; import java.io.File; @@ -59,6 +35,9 @@ public class CollectMapqMetrics extends SinglePassSamProgram { public static final String METRICS_SUFFIX = ".mapq_metrics"; public static final String HISTOGRAM_SUFFIX = ".mapq_histogram.pdf"; private static final String Histogram_R_SCRIPT = "gridss/analysis/mapqHistogram.R"; + + @Argument(doc="If true, also include reads marked as duplicates.") + public boolean INCLUDE_DUPLICATES = false; @Argument(shortName="H", doc="File to write insert size Histogram chart to.") public File Histogram_FILE = null; @@ -98,6 +77,7 @@ protected String[] customCommandLineValidation() { } @Override protected void acceptRead(final SAMRecord record, final ReferenceSequence ref) { + if (record.getDuplicateReadFlag() && !INCLUDE_DUPLICATES) return; multiCollector.acceptRecord(record, ref); } diff --git a/src/main/java/gridss/analysis/CollectStructuralVariantReadMetrics.java b/src/main/java/gridss/analysis/CollectStructuralVariantReadMetrics.java index cfedf6a21..34b0566e9 100644 --- a/src/main/java/gridss/analysis/CollectStructuralVariantReadMetrics.java +++ b/src/main/java/gridss/analysis/CollectStructuralVariantReadMetrics.java @@ -27,6 +27,7 @@ ) public class CollectStructuralVariantReadMetrics extends ProcessStructuralVariantReadsCommandLineProgram { public static final String METRICS_SUFFIX = ".sv_metrics"; + //private static final Log log = Log.getInstance(CollectStructuralVariantReadMetrics.class); public static void main(String[] argv) { System.exit(new CollectStructuralVariantReadMetrics().instanceMain(argv)); @@ -50,6 +51,13 @@ public void setup(SAMFileHeader header, File samFile) { } @Override public void acceptFragment(List records, ReferenceLookup lookup) { + if (!INCLUDE_DUPLICATES) { + for (int i = records.size() - 1; i >= 0; i--) { + if (records.get(i).getDuplicateReadFlag()) { + records.remove(i); + } + } + } boolean hasConsistentReadPair = ExtractSVReads.hasReadPairingConsistentWithReference(getReadPairConcordanceCalculator(), records); boolean[] hasConsistentReadAlignment = ExtractSVReads.hasReadAlignmentConsistentWithReference(records); boolean hasOeaAnchor = false; diff --git a/src/main/java/gridss/analysis/CollectTagMetrics.java b/src/main/java/gridss/analysis/CollectTagMetrics.java index d22316354..7b028b667 100644 --- a/src/main/java/gridss/analysis/CollectTagMetrics.java +++ b/src/main/java/gridss/analysis/CollectTagMetrics.java @@ -29,6 +29,7 @@ import java.util.HashMap; import java.util.Map; +import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import htsjdk.samtools.SAMFileHeader; @@ -49,6 +50,9 @@ public class CollectTagMetrics extends SinglePassSamProgram { public static final String METRICS_SUFFIX = ".tag_metrics"; + @Argument(doc="If true, also include reads marked as duplicates.") + public boolean INCLUDE_DUPLICATES = false; + private Map tags = new HashMap<>(); /** Required main method. */ @@ -64,6 +68,7 @@ protected void setup(final SAMFileHeader header, final File samFile) { @Override protected void acceptRead(final SAMRecord rec, final ReferenceSequence ref) { + if (rec.getDuplicateReadFlag() && !INCLUDE_DUPLICATES) return; for (SAMTagAndValue attr : rec.getAttributes()) { String tag = attr.tag; TagSummaryMetrics metric = tags.get(tag); diff --git a/src/main/java/gridss/cmdline/GcSinglePassSamProgram.java b/src/main/java/gridss/cmdline/GcSinglePassSamProgram.java index 02cc0308b..19460f149 100644 --- a/src/main/java/gridss/cmdline/GcSinglePassSamProgram.java +++ b/src/main/java/gridss/cmdline/GcSinglePassSamProgram.java @@ -86,8 +86,8 @@ public ReadPairConcordanceCalculator getReadPairConcordanceCalculator() { } // --------- end chunk from ProcessStructuralVariantReadsCommandLineProgram --------- // --------- start chunk from ReferenceCommandLineProgram --------- - @Argument(doc = "Ignore reads marked as duplicates.", optional = true) - public boolean IGNORE_DUPLICATES = true; + @Argument(doc="If true, also include reads marked as duplicates.") + public boolean INCLUDE_DUPLICATES = false; private ReferenceLookup reference; public ReferenceLookup getReference() { IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE); diff --git a/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java b/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java index ec45294cf..e2e3a8019 100644 --- a/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java +++ b/src/main/java/gridss/cmdline/ProcessStructuralVariantReadsCommandLineProgram.java @@ -44,6 +44,8 @@ public abstract class ProcessStructuralVariantReadsCommandLineProgram extends By public File INSERT_SIZE_METRICS = null; @Argument(doc="Include unmapped reads", optional=true) public boolean UNMAPPED_READS = true; + @Argument(doc="If true, also include reads marked as duplicates.") + public boolean INCLUDE_DUPLICATES = false; @Override protected String[] customCommandLineValidation() { if (READ_PAIR_CONCORDANCE_METHOD == ReadPairConcordanceMethod.PERCENTAGE && INSERT_SIZE_METRICS == null) {