Skip to content

Commit

Permalink
formats: add FastQC parser, #187
Browse files Browse the repository at this point in the history
  • Loading branch information
jtarraga committed Jun 24, 2020
1 parent e9252c5 commit 183872e
Showing 1 changed file with 274 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
package org.opencb.biodata.formats.sequence.fastqc.io;

import org.opencb.biodata.formats.sequence.fastqc.*;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Map;

/**
 * Parser for FastQC text reports made of {@code >>Module name<TAB>status} sections, each terminated by a
 * {@code >>END_MODULE} line and containing tab-separated data rows.
 */
public class FastQcParser {

    /** Prefix shared by every module header line and by the module terminator. */
    private static final String MODULE_PREFIX = ">>";

    /** Line that closes a module section. */
    private static final String END_MODULE = ">>END_MODULE";

    /** Prefix of the column-header/comment lines that are skipped inside a module. */
    private static final String COMMENT_PREFIX = "#";

    /** Column separator used by header and data lines. */
    private static final String FIELD_SEPARATOR = "\t";

    /**
     * Parses a FastQC report file into a {@link FastQc} object.
     * <p>
     * The first line of the file is skipped without being inspected. For every recognised module header the
     * upper-cased status (second tab-separated column) is stored in the summary and the module rows are parsed
     * into the corresponding {@link FastQc} member. Modules that are not recognised are ignored, including their
     * {@code >>END_MODULE} terminator.
     * <p>
     * A module header line without a status column still results in an {@link ArrayIndexOutOfBoundsException}.
     *
     * @param file FastQC report file to read
     * @return the populated {@link FastQc} object
     * @throws IOException if the file cannot be opened or read
     */
    public static FastQc parse(File file) throws IOException {
        FastQc fastQc = new FastQc();

        // try-with-resources guarantees both readers are closed even when a row fails to parse
        try (FileReader fr = new FileReader(file);
             BufferedReader br = new BufferedReader(fr)) {

            // Skip first line
            br.readLine();

            String line;
            while ((line = br.readLine()) != null) {
                if (!line.startsWith(MODULE_PREFIX)) {
                    continue;
                }
                if (line.startsWith(END_MODULE)) {
                    // Terminator of a module that no parser consumed (unrecognised module): it carries no
                    // status column, so it must not reach the split below
                    continue;
                }

                // Locale.ROOT keeps the status stable regardless of the default locale (e.g. Turkish dotted I)
                String status = line.split(FIELD_SEPARATOR)[1].toUpperCase(Locale.ROOT);

                if (line.startsWith(">>Basic Statistics")) {
                    fastQc.getSummary().setBasicStatistics(status);
                    parseBasicStatistics(fastQc.getBasicStats(), br);
                } else if (line.startsWith(">>Per base sequence quality")) {
                    fastQc.getSummary().setPerBaseSeqQuality(status);
                    parsePerBaseSeqQuality(fastQc.getPerBaseSeqQualities(), br);
                } else if (line.startsWith(">>Per tile sequence quality")) {
                    fastQc.getSummary().setPerTileSeqQuality(status);
                    parsePerTileSeqQuality(fastQc.getPerTileSeqQualities(), br);
                } else if (line.startsWith(">>Per sequence quality scores")) {
                    fastQc.getSummary().setPerSeqQualityScores(status);
                    parsePerSeqQualityScores(fastQc.getPerSeqQualityScores(), br);
                } else if (line.startsWith(">>Per base sequence content")) {
                    fastQc.getSummary().setPerBaseSeqContent(status);
                    parsePerBaseSeqContent(fastQc.getPerBaseSeqContent(), br);
                } else if (line.startsWith(">>Per sequence GC content")) {
                    fastQc.getSummary().setPerSeqGcContent(status);
                    parsePerSeqGcContent(fastQc.getPerSeqGcContent(), br);
                } else if (line.startsWith(">>Per base N content")) {
                    fastQc.getSummary().setPerBaseNContent(status);
                    parsePerBaseNContent(fastQc.getPerBaseNContent(), br);
                } else if (line.startsWith(">>Sequence Length Distribution")) {
                    fastQc.getSummary().setSeqLengthDistribution(status);
                    parseSeqLengthDistribution(fastQc.getSeqLengthDistribution(), br);
                } else if (line.startsWith(">>Sequence Duplication Levels")) {
                    fastQc.getSummary().setSeqDuplicationLevels(status);
                    parseSeqDuplicationLevels(fastQc.getSeqDuplicationLevels(), br);
                } else if (line.startsWith(">>Overrepresented sequences")) {
                    fastQc.getSummary().setOverrepresentedSeqs(status);
                    parseOverrepresentedSeqs(fastQc.getOverrepresentedSeqs(), br);
                } else if (line.startsWith(">>Adapter Content")) {
                    fastQc.getSummary().setAdapterContent(status);
                    parseAdapterContent(fastQc.getAdapterContent(), br);
                } else if (line.startsWith(">>Kmer Content")) {
                    fastQc.getSummary().setKmerContent(status);
                    parseKmerContent(fastQc.getKmerContent(), br);
                }
            }
        }

        return fastQc;
    }

    /**
     * Reads the next data row of the module currently being parsed.
     * <p>
     * Lines starting with {@code #} are skipped. Reading stops at the {@code >>END_MODULE} line (which is
     * consumed) or at the end of the stream.
     *
     * @param br reader positioned inside a module section
     * @return the tab-separated fields of the next data row, or {@code null} when the module or the stream ends
     * @throws IOException if the underlying reader fails
     */
    private static String[] nextRow(BufferedReader br) throws IOException {
        String line;
        while ((line = br.readLine()) != null) {
            if (line.startsWith(COMMENT_PREFIX)) {
                continue;
            }
            if (line.startsWith(END_MODULE)) {
                return null;
            }
            return line.split(FIELD_SEPARATOR);
        }
        return null;
    }

    /**
     * Parses the rows of the "Kmer Content" module and appends one {@link KmerContent} per row.
     *
     * @param kmerContent list receiving the parsed rows
     * @param br          reader positioned right after the module header
     * @throws IOException if the underlying reader fails
     */
    private static void parseKmerContent(List<KmerContent> kmerContent, BufferedReader br) throws IOException {
        String[] fields;
        while ((fields = nextRow(br)) != null) {
            // #Sequence Count PValue Obs/Exp Max Max Obs/Exp Position
            kmerContent.add(new KmerContent(fields[0], Integer.parseInt(fields[1]), Double.parseDouble(fields[2]),
                    Double.parseDouble(fields[3]), fields[4]));
        }
    }

    /**
     * Parses the rows of the "Adapter Content" module and appends one {@link AdapterContent} per row.
     *
     * @param adapterContent list receiving the parsed rows
     * @param br             reader positioned right after the module header
     * @throws IOException if the underlying reader fails
     */
    private static void parseAdapterContent(List<AdapterContent> adapterContent, BufferedReader br) throws IOException {
        String[] fields;
        while ((fields = nextRow(br)) != null) {
            // #Position Illumina Universal Adapter Illumina Small RNA 3' Adapter Illumina Small RNA 5' Adapter
            // Nextera Transposase Sequence SOLID Small RNA Adapter
            adapterContent.add(new AdapterContent(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2]),
                    Double.parseDouble(fields[3]), Double.parseDouble(fields[4]), Double.parseDouble(fields[5])));
        }
    }

    /**
     * Parses the rows of the "Overrepresented sequences" module and appends one {@link OverrepresentedSeq} per row.
     *
     * @param overrepresentedSeqs list receiving the parsed rows
     * @param br                  reader positioned right after the module header
     * @throws IOException if the underlying reader fails
     */
    private static void parseOverrepresentedSeqs(List<OverrepresentedSeq> overrepresentedSeqs, BufferedReader br) throws IOException {
        String[] fields;
        while ((fields = nextRow(br)) != null) {
            // #Sequence Count Percentage Possible Source
            overrepresentedSeqs.add(new OverrepresentedSeq(fields[0], Integer.parseInt(fields[1]), Double.parseDouble(fields[2]),
                    fields[3]));
        }
    }

    /**
     * Parses the rows of the "Sequence Duplication Levels" module and appends one {@link SeqDuplicationLevel} per row.
     *
     * @param seqDuplicationLevels list receiving the parsed rows
     * @param br                   reader positioned right after the module header
     * @throws IOException if the underlying reader fails
     */
    private static void parseSeqDuplicationLevels(List<SeqDuplicationLevel> seqDuplicationLevels, BufferedReader br) throws IOException {
        String[] fields;
        while ((fields = nextRow(br)) != null) {
            // #Duplication Level Percentage of deduplicated Percentage of total
            seqDuplicationLevels.add(new SeqDuplicationLevel(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2])));
        }
    }

    /**
     * Parses the rows of the "Sequence Length Distribution" module into a length-to-count map.
     *
     * @param seqLengthDistribution map receiving one entry per row (first column parsed as an integer key)
     * @param br                    reader positioned right after the module header
     * @throws IOException if the underlying reader fails
     */
    private static void parseSeqLengthDistribution(Map<Integer, Double> seqLengthDistribution, BufferedReader br) throws IOException {
        String[] fields;
        while ((fields = nextRow(br)) != null) {
            // #Length Count
            seqLengthDistribution.put(Integer.parseInt(fields[0]), Double.parseDouble(fields[1]));
        }
    }

    /**
     * Parses the rows of the "Per base N content" module into a base-to-value map.
     *
     * @param perBaseNContent map receiving one entry per row (first column kept as a string key)
     * @param br              reader positioned right after the module header
     * @throws IOException if the underlying reader fails
     */
    private static void parsePerBaseNContent(Map<String, Double> perBaseNContent, BufferedReader br) throws IOException {
        String[] fields;
        while ((fields = nextRow(br)) != null) {
            // #Base N-Count
            perBaseNContent.put(fields[0], Double.parseDouble(fields[1]));
        }
    }

    /**
     * Parses the rows of the "Per sequence GC content" module, storing the second column of each row at
     * consecutive positions of the given array, starting at index 0.
     *
     * @param perSeqGcContent array receiving the counts; NOTE(review): assumed to be at least as long as the
     *                        number of rows in the module, no bounds check is done here - confirm against FastQc
     * @param br              reader positioned right after the module header
     * @throws IOException if the underlying reader fails
     */
    private static void parsePerSeqGcContent(double[] perSeqGcContent, BufferedReader br) throws IOException {
        int i = 0;
        String[] fields;
        while ((fields = nextRow(br)) != null) {
            // #GC Content Count
            perSeqGcContent[i++] = Double.parseDouble(fields[1]);
        }
    }

    /**
     * Parses the rows of the "Per base sequence content" module and appends one {@link PerBaseSeqContent} per row.
     *
     * @param perBaseSeqContent list receiving the parsed rows
     * @param br                reader positioned right after the module header
     * @throws IOException if the underlying reader fails
     */
    private static void parsePerBaseSeqContent(List<PerBaseSeqContent> perBaseSeqContent, BufferedReader br) throws IOException {
        String[] fields;
        while ((fields = nextRow(br)) != null) {
            // #Base G A T C
            perBaseSeqContent.add(new PerBaseSeqContent(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2]),
                    Double.parseDouble(fields[3]), Double.parseDouble(fields[4])));
        }
    }

    /**
     * Parses the rows of the "Per tile sequence quality" module and appends one {@link PerTileSeqQuality} per row.
     *
     * @param perTileSeqQualities list receiving the parsed rows
     * @param br                  reader positioned right after the module header
     * @throws IOException if the underlying reader fails
     */
    private static void parsePerTileSeqQuality(List<PerTileSeqQuality> perTileSeqQualities, BufferedReader br) throws IOException {
        String[] fields;
        while ((fields = nextRow(br)) != null) {
            // #Tile Base Mean
            perTileSeqQualities.add(new PerTileSeqQuality(fields[0], fields[1], Double.parseDouble(fields[2])));
        }
    }

    /**
     * Parses the rows of the "Per sequence quality scores" module into a quality-to-count map.
     *
     * @param perSeqQualityScores map receiving one entry per row (first column parsed as an integer key)
     * @param br                  reader positioned right after the module header
     * @throws IOException if the underlying reader fails
     */
    private static void parsePerSeqQualityScores(Map<Integer, Double> perSeqQualityScores, BufferedReader br) throws IOException {
        String[] fields;
        while ((fields = nextRow(br)) != null) {
            // #Quality Count
            perSeqQualityScores.put(Integer.parseInt(fields[0]), Double.parseDouble(fields[1]));
        }
    }

    /**
     * Parses the rows of the "Per base sequence quality" module and appends one {@link PerBaseSeqQuality} per row.
     *
     * @param perBaseSequenceQualities list receiving the parsed rows
     * @param br                       reader positioned right after the module header
     * @throws IOException if the underlying reader fails
     */
    private static void parsePerBaseSeqQuality(List<PerBaseSeqQuality> perBaseSequenceQualities, BufferedReader br) throws IOException {
        String[] fields;
        while ((fields = nextRow(br)) != null) {
            // #Base Mean Median Lower Quartile Upper Quartile 10th Percentile 90th Percentile
            perBaseSequenceQualities.add(new PerBaseSeqQuality(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2]),
                    Double.parseDouble(fields[3]), Double.parseDouble(fields[4]), Double.parseDouble(fields[5]),
                    Double.parseDouble(fields[6])));
        }
    }

    /**
     * Parses the rows of the "Basic Statistics" module into a map from the first column to the second column.
     *
     * @param basicStats map receiving one entry per row
     * @param br         reader positioned right after the module header
     * @throws IOException if the underlying reader fails
     */
    private static void parseBasicStatistics(Map<String, String> basicStats, BufferedReader br) throws IOException {
        String[] fields;
        while ((fields = nextRow(br)) != null) {
            basicStats.put(fields[0], fields[1]);
        }
    }

}

0 comments on commit 183872e

Please sign in to comment.