From 183872eb896834eca358691ef03e8ed1d6eba3a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Wed, 24 Jun 2020 08:43:24 +0200 Subject: [PATCH] formats: add FastQC parser, #187 --- .../sequence/fastqc/io/FastQcParser.java | 274 ++++++++++++++++++ 1 file changed, 274 insertions(+) create mode 100644 biodata-formats/src/main/java/org/opencb/biodata/formats/sequence/fastqc/io/FastQcParser.java diff --git a/biodata-formats/src/main/java/org/opencb/biodata/formats/sequence/fastqc/io/FastQcParser.java b/biodata-formats/src/main/java/org/opencb/biodata/formats/sequence/fastqc/io/FastQcParser.java new file mode 100644 index 000000000..a92b723f0 --- /dev/null +++ b/biodata-formats/src/main/java/org/opencb/biodata/formats/sequence/fastqc/io/FastQcParser.java @@ -0,0 +1,274 @@ +package org.opencb.biodata.formats.sequence.fastqc.io; + +import org.opencb.biodata.formats.sequence.fastqc.*; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.List; +import java.util.Map; + +public class FastQcParser { + + public static FastQc parse(File file) throws IOException { + FastQc fastQc = new FastQc(); + + FileReader fr = new FileReader(file); + + BufferedReader br = new BufferedReader(fr); + + // Skip first line + br.readLine(); + + String line; + + while ((line = br.readLine()) != null) { + if (line.startsWith(">>")) { + String status = line.split("\t")[1].toUpperCase(); + if (line.startsWith(">>Basic Statistics")) { + fastQc.getSummary().setBasicStatistics(status); + parseBasicStatistics(fastQc.getBasicStats(), br); + } else if (line.startsWith(">>Per base sequence quality")) { + fastQc.getSummary().setPerBaseSeqQuality(status); + parsePerBaseSeqQuality(fastQc.getPerBaseSeqQualities(), br); + } else if (line.startsWith(">>Per tile sequence quality")) { + fastQc.getSummary().setPerTileSeqQuality(status); + parsePerTileSeqQuality(fastQc.getPerTileSeqQualities(), br); + } else if (line.startsWith(">>Per sequence quality scores")) { + fastQc.getSummary().setPerSeqQualityScores(status); + parsePerSeqQualityScores(fastQc.getPerSeqQualityScores(), br); + } else if (line.startsWith(">>Per base sequence content")) { + fastQc.getSummary().setPerBaseSeqContent(status); + parsePerBaseSeqContent(fastQc.getPerBaseSeqContent(), br); + } else if (line.startsWith(">>Per sequence GC content")) { + fastQc.getSummary().setPerSeqGcContent(status); + parsePerSeqGcContent(fastQc.getPerSeqGcContent(), br); + } else if (line.startsWith(">>Per base N content")) { + fastQc.getSummary().setPerBaseNContent(status); + parsePerBaseNContent(fastQc.getPerBaseNContent(), br); + } else if (line.startsWith(">>Sequence Length Distribution")) { + fastQc.getSummary().setSeqLengthDistribution(status); + parseSeqLengthDistribution(fastQc.getSeqLengthDistribution(), br); + } else if (line.startsWith(">>Sequence Duplication Levels")) { + fastQc.getSummary().setSeqDuplicationLevels(status); + parseSeqDuplicationLevels(fastQc.getSeqDuplicationLevels(), br); + } else if (line.startsWith(">>Overrepresented sequences")) { + fastQc.getSummary().setOverrepresentedSeqs(status); + parseOverrepresentedSeqs(fastQc.getOverrepresentedSeqs(), br); + } else if (line.startsWith(">>Adapter Content")) { + fastQc.getSummary().setAdapterContent(status); + parseAdapterContent(fastQc.getAdapterContent(), br); + } else if (line.startsWith(">>Kmer Content")) { + fastQc.getSummary().setKmerContent(status); + parseKmerContent(fastQc.getKmerContent(), br); + } + } + } + fr.close(); + + return fastQc; + + } + + private static void parseKmerContent(List kmerContent, BufferedReader br) throws IOException { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + if (line.startsWith(">>END_MODULE")) { + return; + } + + String[] fields = line.split("\t"); + // #Sequence Count PValue Obs/Exp Max Max Obs/Exp Position + kmerContent.add(new KmerContent(fields[0], Integer.parseInt(fields[1]), Double.parseDouble(fields[2]), + Double.parseDouble(fields[3]), fields[4])); + } + } + + private static void parseAdapterContent(List adapterContent, BufferedReader br) throws IOException { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + if (line.startsWith(">>END_MODULE")) { + return; + } + + String[] fields = line.split("\t"); + // #Position Illumina Universal Adapter Illumina Small RNA 3' Adapter Illumina Small RNA 5' Adapter Nextera Transposase Sequence SOLID S + // mall RNA Adapter + adapterContent.add(new AdapterContent(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2]), + Double.parseDouble(fields[3]), Double.parseDouble(fields[4]), Double.parseDouble(fields[5]))); + } + + } + + private static void parseOverrepresentedSeqs(List overrepresentedSeqs, BufferedReader br) throws IOException { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + if (line.startsWith(">>END_MODULE")) { + return; + } + + String[] fields = line.split("\t"); + // #Sequence Count Percentage Possible Source + overrepresentedSeqs.add(new OverrepresentedSeq(fields[0], Integer.parseInt(fields[1]), Double.parseDouble(fields[2]), + fields[3])); + } + } + + private static void parseSeqDuplicationLevels(List seqDuplicationLevels, BufferedReader br) throws IOException { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + if (line.startsWith(">>END_MODULE")) { + return; + } + + String[] fields = line.split("\t"); + // #Duplication Level Percentage of deduplicated Percentage of total + seqDuplicationLevels.add(new SeqDuplicationLevel(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2]))); + } + } + + private static void parseSeqLengthDistribution(Map seqLengthDistribution, BufferedReader br) throws IOException { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + if (line.startsWith(">>END_MODULE")) { + return; + } + + String[] fields = line.split("\t"); + // #Length Count + seqLengthDistribution.put(Integer.parseInt(fields[0]), Double.parseDouble(fields[1])); + } + } + + private static void parsePerBaseNContent(Map perBaseNContent, BufferedReader br) throws IOException { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + if (line.startsWith(">>END_MODULE")) { + return; + } + + String[] fields = line.split("\t"); + // #Base N-Count + perBaseNContent.put(fields[0], Double.parseDouble(fields[1])); + } + } + + private static void parsePerSeqGcContent(double[] perSeqGcContent, BufferedReader br) throws IOException { + int i = 0; + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + if (line.startsWith(">>END_MODULE")) { + return; + } + + String[] fields = line.split("\t"); + // #GC Content Count + perSeqGcContent[i++] = Double.parseDouble(fields[1]); + } + } + + private static void parsePerBaseSeqContent(List perBaseSeqContent, BufferedReader br) throws IOException { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + if (line.startsWith(">>END_MODULE")) { + return; + } + + String[] fields = line.split("\t"); + // #Base G A T C + perBaseSeqContent.add(new PerBaseSeqContent(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2]), + Double.parseDouble(fields[3]), Double.parseDouble(fields[4]))); + } + } + + private static void parsePerTileSeqQuality(List perTileSeqQualities, BufferedReader br) throws IOException { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + if (line.startsWith(">>END_MODULE")) { + return; + } + + String[] fields = line.split("\t"); + // #Tile Base Mean + perTileSeqQualities.add(new PerTileSeqQuality(fields[0], fields[1], Double.parseDouble(fields[2]))); + } + } + + private static void parsePerSeqQualityScores(Map perSeqQualityScores, BufferedReader br) throws IOException { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + if (line.startsWith(">>END_MODULE")) { + return; + } + + String[] fields = line.split("\t"); + // #Quality Count + perSeqQualityScores.put(Integer.parseInt(fields[0]), Double.parseDouble(fields[1])); + } + } + + private static void parsePerBaseSeqQuality(List perBaseSequenceQualities, BufferedReader br) throws IOException { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + if (line.startsWith(">>END_MODULE")) { + return; + } + + String[] fields = line.split("\t"); + // #Base Mean Median Lower Quartile Upper Quartile 10th Percentile 90th Percentile + perBaseSequenceQualities.add(new PerBaseSeqQuality(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2]), + Double.parseDouble(fields[3]), Double.parseDouble(fields[4]), Double.parseDouble(fields[5]), + Double.parseDouble(fields[6]))); + } + } + + private static void parseBasicStatistics(Map basicStats, BufferedReader br) throws IOException { + String line; + while ((line = br.readLine()) != null) { + if (line.startsWith("#")) { + continue; + } + if (line.startsWith(">>END_MODULE")) { + return; + } + + String[] fields = line.split("\t"); + basicStats.put(fields[0], fields[1]); + } + } + +}