add better error for already written files/duplicate names
1 parent: ec9e551, commit: de9c85c
Showing 8 changed files with 261 additions and 176 deletions.
src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java (151 changes: 8 additions & 143 deletions)
Large diffs are not rendered by default.
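The CreateVariantIngestFiles diff itself is not rendered here, but the commit message says it adds a better error for already-written output files and duplicate sample names. As a hedged sketch only (the class and method names below are hypothetical, not the actual change), a pre-flight check of that shape could look like this:

// Hypothetical illustration only; the real change lives in the non-rendered CreateVariantIngestFiles diff.
// Failing fast with a named exception gives a clearer error than a low-level write failure later on.
import java.io.File;
import java.nio.file.FileAlreadyExistsException;

final class OutputFilePrecheck {
    static void assertNotAlreadyWritten(File outputFile) throws FileAlreadyExistsException {
        if (outputFile.exists()) {
            throw new FileAlreadyExistsException(
                    outputFile.getAbsolutePath()
                            + ": output file already exists (possible duplicate sample name); refusing to overwrite.");
        }
    }
}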
...main/java/org/broadinstitute/hellbender/utils/gvs/parquet/GvsHeaderParquetFileWriter.java (92 changes: 92 additions & 0 deletions)
@@ -0,0 +1,92 @@
package org.broadinstitute.hellbender.utils.gvs.parquet;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.ParquetProperties;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.io.OutputFile;
import org.apache.parquet.schema.MessageType;
import org.json.JSONObject;

import java.io.IOException;

public class GvsHeaderParquetFileWriter extends ParquetWriter<JSONObject> {

    /**
     * Uses a deprecated ParquetWriter constructor; this should move to the {@link Builder} once that path works.
     *
     * @param file             destination Parquet file; must not already exist
     * @param schema           Parquet message schema for header records
     * @param enableDictionary whether to enable dictionary encoding
     * @param codecName        compression codec for the output file
     * @throws FileAlreadyExistsException if {@code file} has already been written
     * @throws IOException                if the writer cannot be created
     */
    public GvsHeaderParquetFileWriter(
            Path file,
            MessageType schema,
            boolean enableDictionary,
            CompressionCodecName codecName
    ) throws FileAlreadyExistsException, IOException {
        super(file, new GvsReferenceWriteSupport(schema), codecName, DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE, enableDictionary, false);
    }

    GvsHeaderParquetFileWriter(
            Path file,
            GvsVariantWriteSupport writeSupport,
            CompressionCodecName compressionCodecName,
            int blockSize,
            int pageSize,
            boolean enableDictionary,
            boolean enableValidation,
            ParquetProperties.WriterVersion writerVersion,
            Configuration conf)
            throws IOException {
        // pageSize is passed twice: once as the data page size and once as the dictionary page size.
        super(
                file,
                writeSupport,
                compressionCodecName,
                blockSize,
                pageSize,
                pageSize,
                enableDictionary,
                enableValidation,
                writerVersion,
                conf);
    }

    /** Packages a sample id and header-line hash as a single JSON record for writing. */
    public static JSONObject writeJson(Long sampleId, String headerLineHash) {
        JSONObject record = new JSONObject();
        record.put("sample_id", sampleId);
        record.put("headerLineHash", headerLineHash);
        return record;
    }

    public static class Builder extends ParquetWriter.Builder<JSONObject, Builder> {
        private MessageType schema = null;

        private Builder(Path file) {
            super(file);
        }

        private Builder(OutputFile file) {
            super(file);
        }

        public Builder withType(MessageType type) {
            this.schema = type;
            return this;
        }

        @Override
        protected Builder self() {
            return this;
        }

        @Override
        protected GvsVariantWriteSupport getWriteSupport(Configuration conf) {
            return new GvsVariantWriteSupport(schema);
        }
    }
}
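A minimal usage sketch, not part of the commit: it assumes GvsReferenceWriteSupport accepts the JSONObject records produced by writeJson, and it uses an invented two-field header schema and output path (plus an extra import of org.apache.parquet.schema.MessageTypeParser alongside the imports shown above).

// Sketch only: schema, path, and sample values are invented for illustration.
MessageType headerSchema = MessageTypeParser.parseMessageType("""
        message HeaderRow {
            required int64 sample_id;
            required binary headerLineHash (UTF8);
        }
        """);

try (GvsHeaderParquetFileWriter writer = new GvsHeaderParquetFileWriter(
        new Path("/tmp/vet_header_001.parquet"), headerSchema, true, CompressionCodecName.SNAPPY)) {
    writer.write(GvsHeaderParquetFileWriter.writeJson(100L, "d41d8cd98f00b204"));
} // a second run against the same path should fail with FileAlreadyExistsException, the error this commit improves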
src/test/java/org/broadinstitute/hellbender/tools/gvs/ingest/VetCreatorUnitTest.java (117 changes: 117 additions & 0 deletions)
@@ -0,0 +1,117 @@
package org.broadinstitute.hellbender.tools.gvs.ingest;

import htsjdk.variant.variantcontext.*;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
import org.broadinstitute.hellbender.tools.gvs.common.IngestConstants;
import org.broadinstitute.hellbender.tools.gvs.common.IngestUtils;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static htsjdk.variant.vcf.VCFConstants.DEPTH_KEY;
import static org.broadinstitute.hellbender.utils.variant.GATKVCFConstants.*;

public class VetCreatorUnitTest {
    private static final long SAMPLE_ID = 100;
    private static final String SAMPLE_NAME = "NA1";
    private static final String PROJECT_ID = "test";
    private static final String DATASET_NAME = "test";
    private File outputDirectory = new File("quickstart/output/");
    Path currentRelativePath = Paths.get("");
    private final CommonCode.OutputType outputType = CommonCode.OutputType.PARQUET;
    private static final String VET_FILETYPE_PREFIX = "vet_"; // should this live somewhere else--check out IngestConstants for instance--why is that a tsv?!?!
    String PREFIX_SEPARATOR = "_"; // should this live somewhere else?

    int sampleTableNumber = IngestUtils.getTableNumber(SAMPLE_ID, IngestConstants.partitionPerTable);
    String tableNumber = String.format("%03d", sampleTableNumber);
    String sampleIdentifierForOutputFileName = "parquet";
    public final MessageType PARQUET_SCHEMA = MessageTypeParser // do we want this in a utils file? or as part of a method?
            .parseMessageType("""
                    message VariantRow {
                        required int64 sample_id;
                        required int64 location;
                        required binary ref (UTF8);
                        required binary alt (UTF8);
                        optional binary AS_RAW_MQ (UTF8);
                        optional binary AS_RAW_MQRankSum (UTF8);
                        optional binary AS_QUALapprox (UTF8);
                        optional binary AS_RAW_ReadPosRankSum (UTF8);
                        optional binary AS_SB_TABLE (UTF8);
                        optional binary AS_VarDP (UTF8);
                        required binary call_GT (UTF8);
                        optional binary call_AD (UTF8);
                        optional binary call_DP (UTF8);
                        required int64 call_GQ;
                        optional binary call_PGT (UTF8);
                        optional binary call_PID (UTF8);
                        optional binary call_PS (UTF8);
                        optional binary call_PL (UTF8);
                    }
                    """);

    @Test
    public void testParquetOutputFile() throws IOException {
        // Resolve the output directory against the current working directory.
        File fullPath = new File(currentRelativePath.toAbsolutePath().toFile(), outputDirectory.toString());
        final File parquetOutputFile = new File(fullPath, VET_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + ".parquet");

        File expected = new File(fullPath, "vet_001_parquet.parquet");
        Assert.assertEquals(parquetOutputFile.getAbsoluteFile(), expected.getAbsoluteFile());
        Files.deleteIfExists(parquetOutputFile.toPath());
    }

    // @Test(expectedExceptions = FileAlreadyExistsException.class)
    @Test
    public void testErrorFile() throws IOException {
        VariantContextBuilder builderA =
                new VariantContextBuilder("a", "1", 10329, 10329,
                        Arrays.asList(Allele.REF_C, Allele.ALT_A, Allele.NON_REF_ALLELE));

        Genotype g = new GenotypeBuilder(SAMPLE_NAME)
                .alleles(Arrays.asList(Allele.REF_C, Allele.ALT_A))
                .PL(new int[]{74, 0, 34, 707, 390, 467})
                .DP(64)
                .GQ(36)
                .AD(new int[]{22, 42, 0})
                .attribute(STRAND_BIAS_BY_SAMPLE_KEY, "1,21,6,50")
                .make();

        builderA.attribute(AS_RAW_RMS_MAPPING_QUALITY_KEY, "29707.00|39366.00|2405.00")
                .attribute(AS_RAW_MAP_QUAL_RANK_SUM_KEY, "|-0.2,1|-2.5,1")
                .attribute(RAW_QUAL_APPROX_KEY, "74")
                .attribute(AS_RAW_QUAL_APPROX_KEY, "|74|0")
                .attribute(AS_RAW_READ_POS_RANK_SUM_KEY, "|2.4,1|1.5,1")
                .attribute(AS_SB_TABLE_KEY, "1,21|3,39|3,11")
                .attribute(AS_VARIANT_DEPTH_KEY, "22|42|0")
                .genotypes(Arrays.asList(g));

        VariantContext vc = builderA.make();

        final File parquetOutputFile = new File(outputDirectory, VET_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + ".parquet");
        // Alternatively this could write to a temp file, e.g. Files.createTempFile("vet_001_parquet", ".parquet").

        VetCreator vetCreator = new VetCreator(parquetOutputFile.getName(), SAMPLE_ID, tableNumber, outputDirectory, outputType, PROJECT_ID, DATASET_NAME, true, false, PARQUET_SCHEMA);
        List<String> row = vetCreator.createRow(10329, vc, SAMPLE_NAME);

        Assert.assertEquals("/ by zero", row.get(0));
        Files.deleteIfExists(parquetOutputFile.toPath());
    }
}
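If the goal is to assert the new duplicate-file error directly, TestNG's expectedExceptions attribute is the usual mechanism. The following sketch is not part of the commit and assumes (without verifying) that constructing a second VetCreator over an already-written output file surfaces FileAlreadyExistsException; the constructor arguments simply mirror testErrorFile above.

    // Hedged sketch, not part of this commit: assumes a second VetCreator over the same output
    // file raises FileAlreadyExistsException; constructor arguments mirror testErrorFile above.
    @Test(expectedExceptions = FileAlreadyExistsException.class)
    public void testDuplicateOutputFileRejected() throws IOException {
        final File parquetOutputFile = new File(outputDirectory,
                VET_FILETYPE_PREFIX + tableNumber + PREFIX_SEPARATOR + sampleIdentifierForOutputFileName + ".parquet");
        try {
            new VetCreator(parquetOutputFile.getName(), SAMPLE_ID, tableNumber, outputDirectory, outputType, PROJECT_ID, DATASET_NAME, true, false, PARQUET_SCHEMA);
            new VetCreator(parquetOutputFile.getName(), SAMPLE_ID, tableNumber, outputDirectory, outputType, PROJECT_ID, DATASET_NAME, true, false, PARQUET_SCHEMA);
        } finally {
            Files.deleteIfExists(parquetOutputFile.toPath());
        }
    }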