diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index efd3443..606cd7c 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -5,9 +5,9 @@ name: Go on: push: - branches: [ "master" ] + branches: [ "main", "dev" ] pull_request: - branches: [ "master" ] + branches: [ "main", "dev" ] jobs: diff --git a/README.md b/README.md index a5fc68f..21b8a8d 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ svync --config --input | --- | --- | --- | | `--output`/`-o` | Path to the output VCF file | `stdout` | | `--nodate`/`--nd` | Do not add the date to the output VCF file | `false` | -| `--notation`/`-n` | The notation to use for the output VCF file. Must be one of: breakpoint, breakend. | none | +| `--mute-warnings`/`--mw` | Do not output warnings | `false` | ## Configuration The configuration file is the core of the standardization in Svync. More information can be found in the [configuration documentation](docs/configuration.md). diff --git a/svync.go b/svync.go index dd05ca0..eb5b780 100644 --- a/svync.go +++ b/svync.go @@ -3,8 +3,6 @@ package main import ( "log" "os" - "slices" - "strings" "github.com/nvnieuwk/svync/svync_api" cli "github.com/urfave/cli/v2" @@ -29,23 +27,23 @@ func main() { Usage: "The location to the output VCF file, defaults to stdout", Category: "Optional", }, - &cli.StringFlag{ - Name: "notation", - Aliases: []string{"n"}, - Usage: "The notation to use for the output VCF file. Must be one of: breakpoint, breakend. By default the notation isn't changed", + // TODO re-add this when conversion is implemented + // &cli.BoolFlag{ + // Name: "to-breakpoint", + // Aliases: []string{"tb"}, + // Usage: "Convert pairs of breakends to a single breakpoint variant. WARNING: this will cause some loss of data.", + // Category: "Optional", + // }, + &cli.BoolFlag{ + Name: "mute-warnings", + Aliases: []string{"mw"}, + Usage: "Mute all warnings.", Category: "Optional", - Action: func(c *cli.Context, input string) error { - validNotations := []string{"breakpoint", "breakend"} - if slices.Contains(validNotations, input) { - return nil - } - return cli.Exit("Invalid notation '"+input+"', must be one of: "+strings.Join(validNotations, ", "), 1) - }, }, &cli.StringFlag{ Name: "config", Aliases: []string{"c"}, - Usage: "Configuration file (YAML) to use for the parsing of INFO and FORMAT fields", + Usage: "Configuration file (YAML) used for standardizing the VCF", Required: true, Category: "Required", }, @@ -59,8 +57,7 @@ func main() { }, Action: func(Cctx *cli.Context) error { config := svync_api.ReadConfig(Cctx) - vcf := svync_api.ReadVcf(Cctx) - vcf.StandardizeAndOutput(config, Cctx) // Standardize VCF and write to output file + svync_api.Execute(Cctx, config) return nil }, } diff --git a/svync_api/read.go b/svync_api/execute.go similarity index 60% rename from svync_api/read.go rename to svync_api/execute.go index a9c0128..f04c8db 100644 --- a/svync_api/read.go +++ b/svync_api/execute.go @@ -15,60 +15,80 @@ import ( ) // Read the VCF file and return it as a VCF struct -func ReadVcf(Cctx *cli.Context) *VCF { +func Execute(Cctx *cli.Context, config *Config) { logger := log.New(os.Stderr, "", 0) file := Cctx.String("input") - openFile, err := os.Open(file) - defer openFile.Close() + inputVcf, err := os.Open(file) + defer inputVcf.Close() if err != nil { logger.Fatal(err) } - - vcf := newVCF() - if strings.HasSuffix(file, ".gz") { - vcf.readBgzip(openFile) - } else { - vcf.readPlain(openFile) - } - - return vcf -} - -// Initialize a new VCF -func newVCF() *VCF { - return &VCF{ - Header: Header{ - Info: map[string]HeaderLineIdNumberTypeDescription{}, - Format: map[string]HeaderLineIdNumberTypeDescription{}, - Alt: map[string]HeaderLineIdDescription{}, - Filter: map[string]HeaderLineIdDescription{}, - Contig: []HeaderLineIdLength{}, - }, - Variants: map[string]Variant{}, - } -} - -// Read the VCF file in bgzip format and convert it to a VCF struct -func (vcf *VCF) readBgzip(input *os.File) { - logger := log.New(os.Stderr, "", 0) - - bgReader, err := bgzf.NewReader(input, 1) - if err != nil { - logger.Fatal(err) + header := newHeader() + breakEndVariants := &map[string]Variant{} + headerIsMade := false + variantCount := 0 + + stdout := true + var outputFile *os.File + if Cctx.String("output") != "" { + stdout = false + outputFile, err = os.Create(Cctx.String("output")) + if err != nil { + logger.Fatalf("Failed to create the output file: %v", err) + } + defer outputFile.Close() } - defer bgReader.Close() - for { - b, _, err := readBgzipLine(bgReader) + if strings.HasSuffix(file, ".gz") { + bgReader, err := bgzf.NewReader(inputVcf, 1) if err != nil { - if err == io.EOF { - break + logger.Fatal(err) + } + defer bgReader.Close() + + for { + b, _, err := readBgzipLine(bgReader) + if err != nil { + if err == io.EOF { + break + } + logger.Fatal(string(b[:])) } - logger.Fatal(string(b[:])) + + parseLine( + string(bytes.TrimSpace(b[:])), + header, + breakEndVariants, + config, + Cctx, + &headerIsMade, + outputFile, + stdout, + &variantCount, + ) + } + } else { + scanner := bufio.NewScanner(inputVcf) + const maxCapacity = 8 * 1000000 // 8 MB + scanner.Buffer(make([]byte, maxCapacity), maxCapacity) + for scanner.Scan() { + parseLine( + scanner.Text(), + header, + breakEndVariants, + config, + Cctx, + &headerIsMade, + outputFile, + stdout, + &variantCount, + ) } - vcf.parse(string(bytes.TrimSpace(b[:]))) + if err := scanner.Err(); err != nil { + logger.Fatal(err) + } } } @@ -95,44 +115,59 @@ func readBgzipLine(r *bgzf.Reader) ([]byte, bgzf.Chunk, error) { return data, chunk, err } -// Read the VCF file in plain text format and convert it to a VCF struct -func (vcf *VCF) readPlain(input *os.File) { - logger := log.New(os.Stderr, "", 0) - - scanner := bufio.NewScanner(input) - const maxCapacity = 8 * 1000000 // 8 MB - scanner.Buffer(make([]byte, maxCapacity), maxCapacity) - for scanner.Scan() { - vcf.parse(scanner.Text()) - } - - if err := scanner.Err(); err != nil { - logger.Fatal(err) +// Parse the line and add it to the VCF struct +func parseLine( + line string, + header *Header, + breakEndVariants *map[string]Variant, + config *Config, + Cctx *cli.Context, + headerIsMade *bool, + outputFile *os.File, + stdout bool, + variantCount *int, +) { + if !strings.HasSuffix(line, "#") && !*headerIsMade { + writeHeader(config, Cctx, header, outputFile, stdout) + *headerIsMade = true } -} - -// Parse the line and add it to the VCF struct -func (vcf *VCF) parse(line string) { if strings.HasPrefix(line, "#") { - vcf.Header.parse(line) + header.parse(line) } else { - id := strings.Split(line, "\t")[2] - variant := &Variant{} - variant.Header = &vcf.Header - variant.parse(line) - vcf.Variants[id] = *variant - // logger.Println(vcf.Variants[id]) + // id := strings.Split(line, "\t")[2] + variant := createVariant(line, header, Cctx) + + // TODO continue work on this later + // Convert breakends to breakpoints if the --to-breakpoint flag is set + // if Cctx.Bool("to-breakpoint") && variant.Info["SVTYPE"][0] == "BND" && len(variant.Info["MATEID"]) == 1 { + // mateid := variant.Info["MATEID"][0] + // if mate, ok := (*breakEndVariants)[mateid]; ok { + // variant = toBreakPoint(variant, &mate) + // delete(*breakEndVariants, mateid) + // } else { + // (*breakEndVariants)[id] = *variant + // return + // } + // } + *variantCount++ + standardizeAndOutput(config, Cctx, variant, outputFile, stdout, *variantCount) + + // Standardize and output the variant } } // Parse the line and add it to the Variant struct -func (variant *Variant) parse(line string) { +func createVariant(line string, header *Header, Cctx *cli.Context) *Variant { logger := log.New(os.Stderr, "", 0) - err := error(nil) + variant := new(Variant) + variant.Header = header + data := strings.Split(line, "\t") variant.Chromosome = data[0] + + var err error variant.Pos, err = strconv.ParseInt(data[1], 0, 64) if err != nil { logger.Fatal(err) @@ -152,7 +187,7 @@ func (variant *Variant) parse(line string) { if len(split) > 1 { value = split[1] } - variant.Info[field] = parseInfoFormat(field, value, variant.Header.Info) + variant.Info[field] = parseInfoFormat(field, value, variant.Header.Info, Cctx) } variant.Format = map[string]VariantFormat{} @@ -166,18 +201,22 @@ func (variant *Variant) parse(line string) { } for idx, val := range strings.Split(value, ":") { header := formatHeaders[idx] - variant.Format[sample].Content[header] = parseInfoFormat(header, val, variant.Header.Format) + variant.Format[sample].Content[header] = parseInfoFormat(header, val, variant.Header.Format, Cctx) } } + return variant + } // Parse the value of the INFO or FORMAT field and return it as a slice of strings -func parseInfoFormat(header string, value string, infoFormatLines map[string]HeaderLineIdNumberTypeDescription) []string { +func parseInfoFormat(header string, value string, infoFormatLines map[string]HeaderLineIdNumberTypeDescription, Cctx *cli.Context) []string { logger := log.New(os.Stderr, "", 0) headerLine := infoFormatLines[header] if headerLine == (HeaderLineIdNumberTypeDescription{}) { - logger.Printf("Field %s not found in header, defaulting to Type 'String' and Number '1'", header) + if !Cctx.Bool("mute-warnings") { + logger.Printf("Field %s not found in header, defaulting to Type 'String' and Number '1'", header) + } headerLine = HeaderLineIdNumberTypeDescription{ Id: header, Number: "1", @@ -287,3 +326,16 @@ func convertLineToMap(line string) map[string]string { return data } + +// Create a new header struct +func newHeader() *Header { + return &Header{ + Info: map[string]HeaderLineIdNumberTypeDescription{}, + Format: map[string]HeaderLineIdNumberTypeDescription{}, + Alt: map[string]HeaderLineIdDescription{}, + Filter: map[string]HeaderLineIdDescription{}, + Contig: []HeaderLineIdLength{}, + Other: []string{}, + Samples: []string{}, + } +} diff --git a/svync_api/functions.go b/svync_api/functions.go index 8218e64..5b77449 100644 --- a/svync_api/functions.go +++ b/svync_api/functions.go @@ -39,7 +39,7 @@ func resolveFunction(input string, token string) string { case "len": result += fmt.Sprint(len(value[0])) default: - logger.Fatalf("The function '%s' is not supported", value[1:]) + logger.Fatalf("The function '%s' is not supported", function) } return result } diff --git a/svync_api/resolve.go b/svync_api/resolve.go index 747ac6b..357bc3f 100644 --- a/svync_api/resolve.go +++ b/svync_api/resolve.go @@ -7,10 +7,12 @@ import ( "regexp" "strconv" "strings" + + cli "github.com/urfave/cli/v2" ) // Resolve a value -func ResolveValue(input string, variant *Variant, format *VariantFormat) string { +func ResolveValue(input string, variant *Variant, format *VariantFormat, Cctx *cli.Context) string { logger := log.New(os.Stderr, "", 0) // Replace all the FORMAT fields @@ -27,7 +29,9 @@ func ResolveValue(input string, variant *Variant, format *VariantFormat) string // TODO implement some alternative way to handle missing fields if !ok { - logger.Printf("The field %s is not present in the FORMAT fields of the variant with ID %s, excluding it from this variant", field, variant.Id) + if !Cctx.Bool("mute-warnings") { + logger.Printf("The field %s is not present in the FORMAT fields of the variant with ID %s, excluding it from this variant", field, variant.Id) + } } else if len(fieldSlice) > 2 { index, err := strconv.ParseInt(fieldSlice[2], 0, 64) if err != nil { @@ -50,7 +54,7 @@ func ResolveValue(input string, variant *Variant, format *VariantFormat) string // TODO implement some alternative way to handle missing fields if !ok { infoType := variant.Header.Info[field].Type - if infoType != "Flag" { + if infoType != "Flag" && !Cctx.Bool("mute-warnings") { logger.Printf("The field %s is not present in the INFO fields of the variant with ID %s, excluding it from this variant", field, variant.Id) } } else if len(fieldSlice) > 2 { diff --git a/svync_api/standardize.go b/svync_api/standardize.go index 15ea867..bc5ae77 100644 --- a/svync_api/standardize.go +++ b/svync_api/standardize.go @@ -2,7 +2,6 @@ package svync_api import ( "fmt" - "log" "os" "regexp" "sort" @@ -14,21 +13,7 @@ import ( "golang.org/x/text/language" ) -// Standardize the VCF file and write it to the output file -func (vcf *VCF) StandardizeAndOutput(config *Config, Cctx *cli.Context) { - logger := log.New(os.Stderr, "", 0) - stdout := true - var file *os.File - var err error - if Cctx.String("output") != "" { - stdout = false - file, err = os.Create(Cctx.String("output")) - if err != nil { - logger.Fatalf("Failed to create the output file: %v", err) - } - defer file.Close() - } - +func writeHeader(config *Config, Cctx *cli.Context, header *Header, file *os.File, stdout bool) { // VCF version writeLine("##fileformat=VCFv4.2", file, stdout) @@ -42,7 +27,7 @@ func (vcf *VCF) StandardizeAndOutput(config *Config, Cctx *cli.Context) { descriptionRegex := regexp.MustCompile(`["']?([^"']*)["']?`) // ALT header lines - for _, alt := range vcf.Header.Alt { + for _, alt := range header.Alt { altId := alt.Id if newAlt, ok := config.Alt[altId]; ok { altId = newAlt @@ -53,7 +38,7 @@ func (vcf *VCF) StandardizeAndOutput(config *Config, Cctx *cli.Context) { } // FILTER header lines - for _, filter := range vcf.Header.Filter { + for _, filter := range header.Filter { description := descriptionRegex.FindStringSubmatch(filter.Description)[1] filterLine := fmt.Sprintf("##FILTER=", filter.Id, description) writeLine(filterLine, file, stdout) @@ -76,39 +61,24 @@ func (vcf *VCF) StandardizeAndOutput(config *Config, Cctx *cli.Context) { } // Write the contig fields - for _, contig := range vcf.Header.Contig { + for _, contig := range header.Contig { contigLine := fmt.Sprintf("##contig=", contig.Id, contig.Length) writeLine(contigLine, file, stdout) } // Write the column headers columnHeaders := []string{"#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"} - columnHeaders = append(columnHeaders, vcf.Header.Samples...) + columnHeaders = append(columnHeaders, header.Samples...) writeLine(strings.Join(columnHeaders, "\t"), file, stdout) +} + +// Standardize the VCF file and write it to the output file +func standardizeAndOutput(config *Config, Cctx *cli.Context, variant *Variant, file *os.File, stdout bool, variantCount int) { + + // Standardize the variant + line := variant.standardize(config, Cctx, variantCount).String(config) + writeLine(line, file, stdout) - // Write the variants - variantCount := 0 - for _, variant := range vcf.Variants { - // Standardize the variant - if variant.Parsed { - continue - } - line := "" - if Cctx.String("notation") == "breakpoint" { - if variant.Info["SVTYPE"][0] == "BND" { - line = variant.toBreakPoint(vcf).standardize(config, Cctx, variantCount).String(config) - } else { - line = variant.standardize(config, Cctx, variantCount).String(config) - } - } else if Cctx.String("notation") == "breakend" { - variant1, variant2 := variant.standardize(config, Cctx, variantCount).toBreakEnd() - line = variant1.String(config) + "\n" + variant2.String(config) - } else { - line = variant.standardize(config, Cctx, variantCount).String(config) - } - variantCount++ - writeLine(line, file, stdout) - } } // Standardize the variant using the config @@ -132,7 +102,7 @@ func (variant *Variant) standardize(config *Config, Cctx *cli.Context, count int } } - standardizedVariant.Id = fmt.Sprintf("%s_%v", ResolveValue(config.Id, variant, nil), count) + standardizedVariant.Id = fmt.Sprintf("%s_%v", ResolveValue(config.Id, variant, nil, Cctx), count) // Add info fields for name, infoConfig := range config.Info { @@ -144,7 +114,7 @@ func (variant *Variant) standardize(config *Config, Cctx *cli.Context, count int if value == "" { continue } - standardizedVariant.Info[name] = []string{ResolveValue(value, variant, nil)} + standardizedVariant.Info[name] = []string{ResolveValue(value, variant, nil, Cctx)} } // Add format fields @@ -157,7 +127,7 @@ func (variant *Variant) standardize(config *Config, Cctx *cli.Context, count int if val, ok := formatConfig.Alts[sVType]; ok { value = val } - newFormat.Content[name] = []string{ResolveValue(value, variant, &format)} + newFormat.Content[name] = []string{ResolveValue(value, variant, &format, Cctx)} } standardizedVariant.Format[sample] = *newFormat } diff --git a/svync_api/structs.go b/svync_api/structs.go index e6633bc..4e196f7 100644 --- a/svync_api/structs.go +++ b/svync_api/structs.go @@ -1,10 +1,6 @@ package svync_api // VCF structs -type VCF struct { - Header Header - Variants map[string]Variant -} type Header struct { Info map[string]HeaderLineIdNumberTypeDescription diff --git a/svync_api/variant.go b/svync_api/variant.go index dbd500d..7eaaba5 100644 --- a/svync_api/variant.go +++ b/svync_api/variant.go @@ -2,43 +2,28 @@ package svync_api import ( "fmt" - "log" "math" - "os" "regexp" - "slices" "strconv" "strings" ) // Convert a breakend variant pair to one breakpoint -func (variant *Variant) toBreakPoint(vcf *VCF) *Variant { - logger := log.New(os.Stderr, "", 0) - mateIds := variant.Info["MATEID"] +func toBreakPoint(mate1 *Variant, mate2 *Variant) *Variant { - // Don't convert if less or more than 1 mate are found - if len(mateIds) == 0 || len(mateIds) > 1 { - return variant + // Swap mates if the first one comes after the second one + if mate1.Chromosome == mate2.Chromosome && mate1.Pos > mate2.Pos { + mate1, mate2 = mate2, mate1 } - mateId := mateIds[0] - mateVariant, ok := vcf.Variants[mateId] - if ok { - mateVariant.Parsed = true - vcf.Variants[mateId] = mateVariant - } + alt := mate1.Alt + chr := mate1.Chromosome + pos := mate1.Pos + chr2 := mate2.Chromosome + pos2 := mate2.Pos - alt := variant.Alt altRegex := regexp.MustCompile(`(\[|\])(?P[^:]*):(?P[0-9]*)`) altGroups := altRegex.FindStringSubmatch(alt) - - chr := variant.Chromosome - pos := variant.Pos - chr2 := altGroups[2] - pos2, err := strconv.ParseInt(altGroups[3], 0, 64) - if err != nil { - logger.Fatalf("Couldn't convert string to integer: %v", err) - } bracket := altGroups[1] strand1 := "" strand2 := "" @@ -56,12 +41,12 @@ func (variant *Variant) toBreakPoint(vcf *VCF) *Variant { } filter := "." - if variant.Filter == mateVariant.Filter { - filter = variant.Filter + if mate1.Filter == mate2.Filter { + filter = mate1.Filter } - varQual, err := strconv.ParseFloat(variant.Qual, 64) - mateQual, err := strconv.ParseFloat(mateVariant.Qual, 64) + varQual, err := strconv.ParseFloat(mate1.Qual, 64) + mateQual, err := strconv.ParseFloat(mate2.Qual, 64) qual := "." if err == nil { qual = fmt.Sprintf("%f", (varQual+mateQual)/2) @@ -70,13 +55,13 @@ func (variant *Variant) toBreakPoint(vcf *VCF) *Variant { breakpointVariant := &Variant{ Chromosome: chr, Pos: pos, - Id: variant.Id, - Ref: variant.Ref, + Id: mate1.Id, + Ref: mate1.Ref, Filter: filter, Qual: qual, - Header: variant.Header, - Info: variant.Info, - Format: variant.Format, + Header: mate1.Header, + Info: mate1.Info, + Format: mate1.Format, } breakpointVariant.Info["END"] = []string{fmt.Sprint(pos2)} @@ -102,14 +87,10 @@ func (variant *Variant) toBreakPoint(vcf *VCF) *Variant { svlen = strconv.FormatFloat(-math.Abs(float64(pos2-pos)), 'g', -1, 64) } - if svtype != "" { - breakpointVariant.Alt = svtype - breakpointVariant.Info["SVTYPE"] = []string{svtype} - breakpointVariant.Info["SVLEN"] = []string{svlen} - return breakpointVariant - } - - return variant + breakpointVariant.Alt = svtype + breakpointVariant.Info["SVTYPE"] = []string{svtype} + breakpointVariant.Info["SVLEN"] = []string{svlen} + return breakpointVariant } // Get the length of an insertion @@ -119,73 +100,3 @@ func getInsLen(alt string, strand string, bracket string) int { } return len(alt[:strings.LastIndex(alt, bracket)]) } - -// Convert a breakpoint to variant to its breakend pairs -// -// Determining the ALT fields is impossible for breakpoint -> breakend conversion -func (variant *Variant) toBreakEnd() (*Variant, *Variant) { - logger := log.New(os.Stderr, "", 0) - - id1 := variant.Id + "_01" - id2 := variant.Id + "_02" - - end, err := strconv.ParseInt(variant.Info["END"][0], 0, 64) - if err != nil { - logger.Fatal(err) - } - - chr := variant.Chromosome - chr2 := "" - chrom2, ok := variant.Info["CHR2"] - if !ok { - chr2 = variant.Chromosome - } else { - chr2 = chrom2[0] - } - - info1 := map[string][]string{} - info2 := map[string][]string{} - for name, info := range variant.Info { - if slices.Contains([]string{"CHR2", "END", "SVLEN"}, name) { - continue - } - info1[name] = info - info2[name] = info - } - - info1["SVTYPE"] = []string{"BND"} - info2["SVTYPE"] = []string{"BND"} - info1["MATEID"] = []string{id2} - info2["MATEID"] = []string{id1} - - alt1 := "." - alt2 := "." - - breakend1 := &Variant{ - Chromosome: chr, - Pos: variant.Pos, - Id: id1, - Alt: alt1, - Ref: variant.Ref, - Qual: variant.Qual, - Filter: variant.Filter, - Header: variant.Header, - Info: info1, - Format: variant.Format, - } - - breakend2 := &Variant{ - Chromosome: chr2, - Pos: end, - Id: id2, - Alt: alt2, - Ref: "N", // TODO find a good way to determine the breakend2 REF - Qual: variant.Qual, - Filter: variant.Filter, - Header: variant.Header, - Info: info2, - Format: variant.Format, - } - - return breakend1, breakend2 -}