-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
274 additions
and
0 deletions.
There are no files selected for viewing
274 changes: 274 additions & 0 deletions
274
...ata-formats/src/main/java/org/opencb/biodata/formats/sequence/fastqc/io/FastQcParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,274 @@ | ||
package org.opencb.biodata.formats.sequence.fastqc.io; | ||
|
||
import org.opencb.biodata.formats.sequence.fastqc.*;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Map;
|
||
public class FastQcParser { | ||
|
||
public static FastQc parse(File file) throws IOException { | ||
FastQc fastQc = new FastQc(); | ||
|
||
FileReader fr = new FileReader(file); | ||
|
||
BufferedReader br = new BufferedReader(fr); | ||
|
||
// Skip first line | ||
br.readLine(); | ||
|
||
String line; | ||
|
||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith(">>")) { | ||
String status = line.split("\t")[1].toUpperCase(); | ||
if (line.startsWith(">>Basic Statistics")) { | ||
fastQc.getSummary().setBasicStatistics(status); | ||
parseBasicStatistics(fastQc.getBasicStats(), br); | ||
} else if (line.startsWith(">>Per base sequence quality")) { | ||
fastQc.getSummary().setPerBaseSeqQuality(status); | ||
parsePerBaseSeqQuality(fastQc.getPerBaseSeqQualities(), br); | ||
} else if (line.startsWith(">>Per tile sequence quality")) { | ||
fastQc.getSummary().setPerTileSeqQuality(status); | ||
parsePerTileSeqQuality(fastQc.getPerTileSeqQualities(), br); | ||
} else if (line.startsWith(">>Per sequence quality scores")) { | ||
fastQc.getSummary().setPerSeqQualityScores(status); | ||
parsePerSeqQualityScores(fastQc.getPerSeqQualityScores(), br); | ||
} else if (line.startsWith(">>Per base sequence content")) { | ||
fastQc.getSummary().setPerBaseSeqContent(status); | ||
parsePerBaseSeqContent(fastQc.getPerBaseSeqContent(), br); | ||
} else if (line.startsWith(">>Per sequence GC content")) { | ||
fastQc.getSummary().setPerSeqGcContent(status); | ||
parsePerSeqGcContent(fastQc.getPerSeqGcContent(), br); | ||
} else if (line.startsWith(">>Per base N content")) { | ||
fastQc.getSummary().setPerBaseNContent(status); | ||
parsePerBaseNContent(fastQc.getPerBaseNContent(), br); | ||
} else if (line.startsWith(">>Sequence Length Distribution")) { | ||
fastQc.getSummary().setSeqLengthDistribution(status); | ||
parseSeqLengthDistribution(fastQc.getSeqLengthDistribution(), br); | ||
} else if (line.startsWith(">>Sequence Duplication Levels")) { | ||
fastQc.getSummary().setSeqDuplicationLevels(status); | ||
parseSeqDuplicationLevels(fastQc.getSeqDuplicationLevels(), br); | ||
} else if (line.startsWith(">>Overrepresented sequences")) { | ||
fastQc.getSummary().setOverrepresentedSeqs(status); | ||
parseOverrepresentedSeqs(fastQc.getOverrepresentedSeqs(), br); | ||
} else if (line.startsWith(">>Adapter Content")) { | ||
fastQc.getSummary().setAdapterContent(status); | ||
parseAdapterContent(fastQc.getAdapterContent(), br); | ||
} else if (line.startsWith(">>Kmer Content")) { | ||
fastQc.getSummary().setKmerContent(status); | ||
parseKmerContent(fastQc.getKmerContent(), br); | ||
} | ||
} | ||
} | ||
fr.close(); | ||
|
||
return fastQc; | ||
|
||
} | ||
|
||
private static void parseKmerContent(List<KmerContent> kmerContent, BufferedReader br) throws IOException { | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
if (line.startsWith(">>END_MODULE")) { | ||
return; | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
// #Sequence Count PValue Obs/Exp Max Max Obs/Exp Position | ||
kmerContent.add(new KmerContent(fields[0], Integer.parseInt(fields[1]), Double.parseDouble(fields[2]), | ||
Double.parseDouble(fields[3]), fields[4])); | ||
} | ||
} | ||
|
||
private static void parseAdapterContent(List<AdapterContent> adapterContent, BufferedReader br) throws IOException { | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
if (line.startsWith(">>END_MODULE")) { | ||
return; | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
// #Position Illumina Universal Adapter Illumina Small RNA 3' Adapter Illumina Small RNA 5' Adapter Nextera Transposase Sequence SOLID S | ||
// mall RNA Adapter | ||
adapterContent.add(new AdapterContent(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2]), | ||
Double.parseDouble(fields[3]), Double.parseDouble(fields[4]), Double.parseDouble(fields[5]))); | ||
} | ||
|
||
} | ||
|
||
private static void parseOverrepresentedSeqs(List<OverrepresentedSeq> overrepresentedSeqs, BufferedReader br) throws IOException { | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
if (line.startsWith(">>END_MODULE")) { | ||
return; | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
// #Sequence Count Percentage Possible Source | ||
overrepresentedSeqs.add(new OverrepresentedSeq(fields[0], Integer.parseInt(fields[1]), Double.parseDouble(fields[2]), | ||
fields[3])); | ||
} | ||
} | ||
|
||
private static void parseSeqDuplicationLevels(List<SeqDuplicationLevel> seqDuplicationLevels, BufferedReader br) throws IOException { | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
if (line.startsWith(">>END_MODULE")) { | ||
return; | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
// #Duplication Level Percentage of deduplicated Percentage of total | ||
seqDuplicationLevels.add(new SeqDuplicationLevel(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2]))); | ||
} | ||
} | ||
|
||
private static void parseSeqLengthDistribution(Map<Integer, Double> seqLengthDistribution, BufferedReader br) throws IOException { | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
if (line.startsWith(">>END_MODULE")) { | ||
return; | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
// #Length Count | ||
seqLengthDistribution.put(Integer.parseInt(fields[0]), Double.parseDouble(fields[1])); | ||
} | ||
} | ||
|
||
private static void parsePerBaseNContent(Map<String, Double> perBaseNContent, BufferedReader br) throws IOException { | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
if (line.startsWith(">>END_MODULE")) { | ||
return; | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
// #Base N-Count | ||
perBaseNContent.put(fields[0], Double.parseDouble(fields[1])); | ||
} | ||
} | ||
|
||
private static void parsePerSeqGcContent(double[] perSeqGcContent, BufferedReader br) throws IOException { | ||
int i = 0; | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
if (line.startsWith(">>END_MODULE")) { | ||
return; | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
// #GC Content Count | ||
perSeqGcContent[i++] = Double.parseDouble(fields[1]); | ||
} | ||
} | ||
|
||
private static void parsePerBaseSeqContent(List<PerBaseSeqContent> perBaseSeqContent, BufferedReader br) throws IOException { | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
if (line.startsWith(">>END_MODULE")) { | ||
return; | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
// #Base G A T C | ||
perBaseSeqContent.add(new PerBaseSeqContent(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2]), | ||
Double.parseDouble(fields[3]), Double.parseDouble(fields[4]))); | ||
} | ||
} | ||
|
||
private static void parsePerTileSeqQuality(List<PerTileSeqQuality> perTileSeqQualities, BufferedReader br) throws IOException { | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
if (line.startsWith(">>END_MODULE")) { | ||
return; | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
// #Tile Base Mean | ||
perTileSeqQualities.add(new PerTileSeqQuality(fields[0], fields[1], Double.parseDouble(fields[2]))); | ||
} | ||
} | ||
|
||
private static void parsePerSeqQualityScores(Map<Integer, Double> perSeqQualityScores, BufferedReader br) throws IOException { | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
if (line.startsWith(">>END_MODULE")) { | ||
return; | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
// #Quality Count | ||
perSeqQualityScores.put(Integer.parseInt(fields[0]), Double.parseDouble(fields[1])); | ||
} | ||
} | ||
|
||
private static void parsePerBaseSeqQuality(List<PerBaseSeqQuality> perBaseSequenceQualities, BufferedReader br) throws IOException { | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
if (line.startsWith(">>END_MODULE")) { | ||
return; | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
// #Base Mean Median Lower Quartile Upper Quartile 10th Percentile 90th Percentile | ||
perBaseSequenceQualities.add(new PerBaseSeqQuality(fields[0], Double.parseDouble(fields[1]), Double.parseDouble(fields[2]), | ||
Double.parseDouble(fields[3]), Double.parseDouble(fields[4]), Double.parseDouble(fields[5]), | ||
Double.parseDouble(fields[6]))); | ||
} | ||
} | ||
|
||
private static void parseBasicStatistics(Map<String, String> basicStats, BufferedReader br) throws IOException { | ||
String line; | ||
while ((line = br.readLine()) != null) { | ||
if (line.startsWith("#")) { | ||
continue; | ||
} | ||
if (line.startsWith(">>END_MODULE")) { | ||
return; | ||
} | ||
|
||
String[] fields = line.split("\t"); | ||
basicStats.put(fields[0], fields[1]); | ||
} | ||
} | ||
|
||
} |