From 9d0b21c7c845f854722ae23d6686e517fd4bd89a Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Tue, 16 Jan 2024 11:17:31 +0100 Subject: [PATCH 01/10] refactor done, just some small issues to fix --- svync.go | 20 +--- svync_api/{read.go => execute.go} | 181 +++++++++++++++++++----------- svync_api/standardize.go | 56 +++------ svync_api/structs.go | 4 - svync_api/variant.go | 51 +++------ 5 files changed, 149 insertions(+), 163 deletions(-) rename svync_api/{read.go => execute.go} (66%) diff --git a/svync.go b/svync.go index dd05ca0..65eeded 100644 --- a/svync.go +++ b/svync.go @@ -3,8 +3,6 @@ package main import ( "log" "os" - "slices" - "strings" "github.com/nvnieuwk/svync/svync_api" cli "github.com/urfave/cli/v2" @@ -29,18 +27,11 @@ func main() { Usage: "The location to the output VCF file, defaults to stdout", Category: "Optional", }, - &cli.StringFlag{ - Name: "notation", - Aliases: []string{"n"}, - Usage: "The notation to use for the output VCF file. Must be one of: breakpoint, breakend. By default the notation isn't changed", + &cli.BoolFlag{ + Name: "to-breakpoint", + Aliases: []string{"tb"}, + Usage: "Convert pairs of breakends to a single breakpoint variant. WARNING: this will cause some loss of data.", Category: "Optional", - Action: func(c *cli.Context, input string) error { - validNotations := []string{"breakpoint", "breakend"} - if slices.Contains(validNotations, input) { - return nil - } - return cli.Exit("Invalid notation '"+input+"', must be one of: "+strings.Join(validNotations, ", "), 1) - }, }, &cli.StringFlag{ Name: "config", @@ -59,8 +50,7 @@ func main() { }, Action: func(Cctx *cli.Context) error { config := svync_api.ReadConfig(Cctx) - vcf := svync_api.ReadVcf(Cctx) - vcf.StandardizeAndOutput(config, Cctx) // Standardize VCF and write to output file + svync_api.Execute(Cctx, config) return nil }, } diff --git a/svync_api/read.go b/svync_api/execute.go similarity index 66% rename from svync_api/read.go rename to svync_api/execute.go index a9c0128..997ed6a 100644 --- a/svync_api/read.go +++ b/svync_api/execute.go @@ -15,60 +15,80 @@ import ( ) // Read the VCF file and return it as a VCF struct -func ReadVcf(Cctx *cli.Context) *VCF { +func Execute(Cctx *cli.Context, config *Config) { logger := log.New(os.Stderr, "", 0) file := Cctx.String("input") - openFile, err := os.Open(file) - defer openFile.Close() + inputVcf, err := os.Open(file) + defer inputVcf.Close() if err != nil { logger.Fatal(err) } - - vcf := newVCF() - if strings.HasSuffix(file, ".gz") { - vcf.readBgzip(openFile) - } else { - vcf.readPlain(openFile) - } - - return vcf -} - -// Initialize a new VCF -func newVCF() *VCF { - return &VCF{ - Header: Header{ - Info: map[string]HeaderLineIdNumberTypeDescription{}, - Format: map[string]HeaderLineIdNumberTypeDescription{}, - Alt: map[string]HeaderLineIdDescription{}, - Filter: map[string]HeaderLineIdDescription{}, - Contig: []HeaderLineIdLength{}, - }, - Variants: map[string]Variant{}, - } -} - -// Read the VCF file in bgzip format and convert it to a VCF struct -func (vcf *VCF) readBgzip(input *os.File) { - logger := log.New(os.Stderr, "", 0) - - bgReader, err := bgzf.NewReader(input, 1) - if err != nil { - logger.Fatal(err) + header := newHeader() + breakEndVariants := &map[string]Variant{} + headerIsMade := false + variantCount := 0 + + stdout := true + var outputFile *os.File + if Cctx.String("output") != "" { + stdout = false + outputFile, err = os.Create(Cctx.String("output")) + if err != nil { + logger.Fatalf("Failed to create the output file: %v", err) + } + defer outputFile.Close() } - defer bgReader.Close() - for { - b, _, err := readBgzipLine(bgReader) + if strings.HasSuffix(file, ".gz") { + bgReader, err := bgzf.NewReader(inputVcf, 1) if err != nil { - if err == io.EOF { - break + logger.Fatal(err) + } + defer bgReader.Close() + + for { + b, _, err := readBgzipLine(bgReader) + if err != nil { + if err == io.EOF { + break + } + logger.Fatal(string(b[:])) } - logger.Fatal(string(b[:])) + + parseLine( + string(bytes.TrimSpace(b[:])), + header, + breakEndVariants, + config, + Cctx, + &headerIsMade, + outputFile, + stdout, + &variantCount, + ) + } + } else { + scanner := bufio.NewScanner(inputVcf) + const maxCapacity = 8 * 1000000 // 8 MB + scanner.Buffer(make([]byte, maxCapacity), maxCapacity) + for scanner.Scan() { + parseLine( + scanner.Text(), + header, + breakEndVariants, + config, + Cctx, + &headerIsMade, + outputFile, + stdout, + &variantCount, + ) } - vcf.parse(string(bytes.TrimSpace(b[:]))) + if err := scanner.Err(); err != nil { + logger.Fatal(err) + } } } @@ -95,44 +115,56 @@ func readBgzipLine(r *bgzf.Reader) ([]byte, bgzf.Chunk, error) { return data, chunk, err } -// Read the VCF file in plain text format and convert it to a VCF struct -func (vcf *VCF) readPlain(input *os.File) { - logger := log.New(os.Stderr, "", 0) - - scanner := bufio.NewScanner(input) - const maxCapacity = 8 * 1000000 // 8 MB - scanner.Buffer(make([]byte, maxCapacity), maxCapacity) - for scanner.Scan() { - vcf.parse(scanner.Text()) - } - - if err := scanner.Err(); err != nil { - logger.Fatal(err) +// Parse the line and add it to the VCF struct +func parseLine( + line string, + header *Header, + breakEndVariants *map[string]Variant, + config *Config, + Cctx *cli.Context, + headerIsMade *bool, + outputFile *os.File, + stdout bool, + variantCount *int, +) { + if !strings.HasSuffix(line, "#") && !*headerIsMade { + writeHeader(config, Cctx, header, outputFile, stdout) } -} - -// Parse the line and add it to the VCF struct -func (vcf *VCF) parse(line string) { if strings.HasPrefix(line, "#") { - vcf.Header.parse(line) + header.parse(line) } else { id := strings.Split(line, "\t")[2] - variant := &Variant{} - variant.Header = &vcf.Header - variant.parse(line) - vcf.Variants[id] = *variant - // logger.Println(vcf.Variants[id]) + variant := createVariant(line) + variant.Header = header + + // Convert breakends to breakpoints if the --to-breakpoint flag is set + if Cctx.Bool("to-breakpoint") && variant.Info["SVTYPE"][0] == "BND" && len(variant.Info["MATEID"]) == 1 { + mateid := variant.Info["MATEID"][0] + if mate, ok := (*breakEndVariants)[mateid]; ok { + variant = toBreakPoint(variant, &mate) + delete(*breakEndVariants, mateid) + } else { + (*breakEndVariants)[id] = *variant + return + } + } + *variantCount++ + + // Standardize and output the variant } } // Parse the line and add it to the Variant struct -func (variant *Variant) parse(line string) { +func createVariant(line string) *Variant { logger := log.New(os.Stderr, "", 0) - err := error(nil) + variant := Variant{} + data := strings.Split(line, "\t") variant.Chromosome = data[0] + + var err error variant.Pos, err = strconv.ParseInt(data[1], 0, 64) if err != nil { logger.Fatal(err) @@ -170,6 +202,8 @@ func (variant *Variant) parse(line string) { } } + return &variant + } // Parse the value of the INFO or FORMAT field and return it as a slice of strings @@ -287,3 +321,16 @@ func convertLineToMap(line string) map[string]string { return data } + +// Create a new header struct +func newHeader() *Header { + return &Header{ + Info: map[string]HeaderLineIdNumberTypeDescription{}, + Format: map[string]HeaderLineIdNumberTypeDescription{}, + Alt: map[string]HeaderLineIdDescription{}, + Filter: map[string]HeaderLineIdDescription{}, + Contig: []HeaderLineIdLength{}, + Other: []string{}, + Samples: []string{}, + } +} diff --git a/svync_api/standardize.go b/svync_api/standardize.go index 15ea867..aa0c445 100644 --- a/svync_api/standardize.go +++ b/svync_api/standardize.go @@ -2,7 +2,6 @@ package svync_api import ( "fmt" - "log" "os" "regexp" "sort" @@ -14,21 +13,7 @@ import ( "golang.org/x/text/language" ) -// Standardize the VCF file and write it to the output file -func (vcf *VCF) StandardizeAndOutput(config *Config, Cctx *cli.Context) { - logger := log.New(os.Stderr, "", 0) - stdout := true - var file *os.File - var err error - if Cctx.String("output") != "" { - stdout = false - file, err = os.Create(Cctx.String("output")) - if err != nil { - logger.Fatalf("Failed to create the output file: %v", err) - } - defer file.Close() - } - +func writeHeader(config *Config, Cctx *cli.Context, header *Header, file *os.File, stdout bool) { // VCF version writeLine("##fileformat=VCFv4.2", file, stdout) @@ -42,7 +27,7 @@ func (vcf *VCF) StandardizeAndOutput(config *Config, Cctx *cli.Context) { descriptionRegex := regexp.MustCompile(`["']?([^"']*)["']?`) // ALT header lines - for _, alt := range vcf.Header.Alt { + for _, alt := range header.Alt { altId := alt.Id if newAlt, ok := config.Alt[altId]; ok { altId = newAlt @@ -53,7 +38,7 @@ func (vcf *VCF) StandardizeAndOutput(config *Config, Cctx *cli.Context) { } // FILTER header lines - for _, filter := range vcf.Header.Filter { + for _, filter := range header.Filter { description := descriptionRegex.FindStringSubmatch(filter.Description)[1] filterLine := fmt.Sprintf("##FILTER=", filter.Id, description) writeLine(filterLine, file, stdout) @@ -76,39 +61,24 @@ func (vcf *VCF) StandardizeAndOutput(config *Config, Cctx *cli.Context) { } // Write the contig fields - for _, contig := range vcf.Header.Contig { + for _, contig := range header.Contig { contigLine := fmt.Sprintf("##contig=", contig.Id, contig.Length) writeLine(contigLine, file, stdout) } // Write the column headers columnHeaders := []string{"#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT"} - columnHeaders = append(columnHeaders, vcf.Header.Samples...) + columnHeaders = append(columnHeaders, header.Samples...) writeLine(strings.Join(columnHeaders, "\t"), file, stdout) +} + +// Standardize the VCF file and write it to the output file +func standardizeAndOutput(config *Config, Cctx *cli.Context, variant *Variant, file *os.File, stdout bool, variantCount int) { + + // Standardize the variant + line := variant.standardize(config, Cctx, variantCount).String(config) + writeLine(line, file, stdout) - // Write the variants - variantCount := 0 - for _, variant := range vcf.Variants { - // Standardize the variant - if variant.Parsed { - continue - } - line := "" - if Cctx.String("notation") == "breakpoint" { - if variant.Info["SVTYPE"][0] == "BND" { - line = variant.toBreakPoint(vcf).standardize(config, Cctx, variantCount).String(config) - } else { - line = variant.standardize(config, Cctx, variantCount).String(config) - } - } else if Cctx.String("notation") == "breakend" { - variant1, variant2 := variant.standardize(config, Cctx, variantCount).toBreakEnd() - line = variant1.String(config) + "\n" + variant2.String(config) - } else { - line = variant.standardize(config, Cctx, variantCount).String(config) - } - variantCount++ - writeLine(line, file, stdout) - } } // Standardize the variant using the config diff --git a/svync_api/structs.go b/svync_api/structs.go index e6633bc..4e196f7 100644 --- a/svync_api/structs.go +++ b/svync_api/structs.go @@ -1,10 +1,6 @@ package svync_api // VCF structs -type VCF struct { - Header Header - Variants map[string]Variant -} type Header struct { Info map[string]HeaderLineIdNumberTypeDescription diff --git a/svync_api/variant.go b/svync_api/variant.go index dbd500d..b861636 100644 --- a/svync_api/variant.go +++ b/svync_api/variant.go @@ -12,28 +12,15 @@ import ( ) // Convert a breakend variant pair to one breakpoint -func (variant *Variant) toBreakPoint(vcf *VCF) *Variant { +func toBreakPoint(mate1 *Variant, mate2 *Variant) *Variant { logger := log.New(os.Stderr, "", 0) - mateIds := variant.Info["MATEID"] - // Don't convert if less or more than 1 mate are found - if len(mateIds) == 0 || len(mateIds) > 1 { - return variant - } - - mateId := mateIds[0] - mateVariant, ok := vcf.Variants[mateId] - if ok { - mateVariant.Parsed = true - vcf.Variants[mateId] = mateVariant - } - - alt := variant.Alt + alt := mate1.Alt altRegex := regexp.MustCompile(`(\[|\])(?P[^:]*):(?P[0-9]*)`) altGroups := altRegex.FindStringSubmatch(alt) - chr := variant.Chromosome - pos := variant.Pos + chr := mate1.Chromosome + pos := mate1.Pos chr2 := altGroups[2] pos2, err := strconv.ParseInt(altGroups[3], 0, 64) if err != nil { @@ -56,12 +43,12 @@ func (variant *Variant) toBreakPoint(vcf *VCF) *Variant { } filter := "." - if variant.Filter == mateVariant.Filter { - filter = variant.Filter + if mate1.Filter == mate2.Filter { + filter = mate1.Filter } - varQual, err := strconv.ParseFloat(variant.Qual, 64) - mateQual, err := strconv.ParseFloat(mateVariant.Qual, 64) + varQual, err := strconv.ParseFloat(mate1.Qual, 64) + mateQual, err := strconv.ParseFloat(mate2.Qual, 64) qual := "." if err == nil { qual = fmt.Sprintf("%f", (varQual+mateQual)/2) @@ -70,13 +57,13 @@ func (variant *Variant) toBreakPoint(vcf *VCF) *Variant { breakpointVariant := &Variant{ Chromosome: chr, Pos: pos, - Id: variant.Id, - Ref: variant.Ref, + Id: mate1.Id, + Ref: mate1.Ref, Filter: filter, Qual: qual, - Header: variant.Header, - Info: variant.Info, - Format: variant.Format, + Header: mate1.Header, + Info: mate1.Info, + Format: mate1.Format, } breakpointVariant.Info["END"] = []string{fmt.Sprint(pos2)} @@ -102,14 +89,10 @@ func (variant *Variant) toBreakPoint(vcf *VCF) *Variant { svlen = strconv.FormatFloat(-math.Abs(float64(pos2-pos)), 'g', -1, 64) } - if svtype != "" { - breakpointVariant.Alt = svtype - breakpointVariant.Info["SVTYPE"] = []string{svtype} - breakpointVariant.Info["SVLEN"] = []string{svlen} - return breakpointVariant - } - - return variant + breakpointVariant.Alt = svtype + breakpointVariant.Info["SVTYPE"] = []string{svtype} + breakpointVariant.Info["SVLEN"] = []string{svlen} + return breakpointVariant } // Get the length of an insertion From 5f6a59d74a1a43dbab0ef1b23088727f4e11a3d1 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Tue, 16 Jan 2024 13:46:31 +0100 Subject: [PATCH 02/10] last fixes --- svync_api/execute.go | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/svync_api/execute.go b/svync_api/execute.go index 997ed6a..243ab9d 100644 --- a/svync_api/execute.go +++ b/svync_api/execute.go @@ -129,14 +129,14 @@ func parseLine( ) { if !strings.HasSuffix(line, "#") && !*headerIsMade { writeHeader(config, Cctx, header, outputFile, stdout) + *headerIsMade = true } if strings.HasPrefix(line, "#") { header.parse(line) } else { id := strings.Split(line, "\t")[2] - variant := createVariant(line) - variant.Header = header + variant := createVariant(line, header) // Convert breakends to breakpoints if the --to-breakpoint flag is set if Cctx.Bool("to-breakpoint") && variant.Info["SVTYPE"][0] == "BND" && len(variant.Info["MATEID"]) == 1 { @@ -150,16 +150,18 @@ func parseLine( } } *variantCount++ + standardizeAndOutput(config, Cctx, variant, outputFile, stdout, *variantCount) // Standardize and output the variant } } // Parse the line and add it to the Variant struct -func createVariant(line string) *Variant { +func createVariant(line string, header *Header) *Variant { logger := log.New(os.Stderr, "", 0) - variant := Variant{} + variant := new(Variant) + variant.Header = header data := strings.Split(line, "\t") variant.Chromosome = data[0] @@ -202,7 +204,7 @@ func createVariant(line string) *Variant { } } - return &variant + return variant } From ed544a04dba987e44ab4f775ffe7d706aea30430 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Tue, 16 Jan 2024 14:30:27 +0100 Subject: [PATCH 03/10] fix error message --- svync_api/functions.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/svync_api/functions.go b/svync_api/functions.go index 8218e64..5b77449 100644 --- a/svync_api/functions.go +++ b/svync_api/functions.go @@ -39,7 +39,7 @@ func resolveFunction(input string, token string) string { case "len": result += fmt.Sprint(len(value[0])) default: - logger.Fatalf("The function '%s' is not supported", value[1:]) + logger.Fatalf("The function '%s' is not supported", function) } return result } From d8175ff71719c88b82731dcdded9480321f82291 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Tue, 16 Jan 2024 14:36:51 +0100 Subject: [PATCH 04/10] add a mute-warnings option --- svync.go | 6 ++++++ svync_api/execute.go | 14 ++++++++------ svync_api/resolve.go | 10 +++++++--- svync_api/standardize.go | 6 +++--- 4 files changed, 24 insertions(+), 12 deletions(-) diff --git a/svync.go b/svync.go index 65eeded..551279d 100644 --- a/svync.go +++ b/svync.go @@ -33,6 +33,12 @@ func main() { Usage: "Convert pairs of breakends to a single breakpoint variant. WARNING: this will cause some loss of data.", Category: "Optional", }, + &cli.BoolFlag{ + Name: "mute-warnings", + Aliases: []string{"mw"}, + Usage: "Mute all warnings.", + Category: "Optional", + }, &cli.StringFlag{ Name: "config", Aliases: []string{"c"}, diff --git a/svync_api/execute.go b/svync_api/execute.go index 243ab9d..f4400e1 100644 --- a/svync_api/execute.go +++ b/svync_api/execute.go @@ -136,7 +136,7 @@ func parseLine( header.parse(line) } else { id := strings.Split(line, "\t")[2] - variant := createVariant(line, header) + variant := createVariant(line, header, Cctx) // Convert breakends to breakpoints if the --to-breakpoint flag is set if Cctx.Bool("to-breakpoint") && variant.Info["SVTYPE"][0] == "BND" && len(variant.Info["MATEID"]) == 1 { @@ -157,7 +157,7 @@ func parseLine( } // Parse the line and add it to the Variant struct -func createVariant(line string, header *Header) *Variant { +func createVariant(line string, header *Header, Cctx *cli.Context) *Variant { logger := log.New(os.Stderr, "", 0) variant := new(Variant) @@ -186,7 +186,7 @@ func createVariant(line string, header *Header) *Variant { if len(split) > 1 { value = split[1] } - variant.Info[field] = parseInfoFormat(field, value, variant.Header.Info) + variant.Info[field] = parseInfoFormat(field, value, variant.Header.Info, Cctx) } variant.Format = map[string]VariantFormat{} @@ -200,7 +200,7 @@ func createVariant(line string, header *Header) *Variant { } for idx, val := range strings.Split(value, ":") { header := formatHeaders[idx] - variant.Format[sample].Content[header] = parseInfoFormat(header, val, variant.Header.Format) + variant.Format[sample].Content[header] = parseInfoFormat(header, val, variant.Header.Format, Cctx) } } @@ -209,11 +209,13 @@ func createVariant(line string, header *Header) *Variant { } // Parse the value of the INFO or FORMAT field and return it as a slice of strings -func parseInfoFormat(header string, value string, infoFormatLines map[string]HeaderLineIdNumberTypeDescription) []string { +func parseInfoFormat(header string, value string, infoFormatLines map[string]HeaderLineIdNumberTypeDescription, Cctx *cli.Context) []string { logger := log.New(os.Stderr, "", 0) headerLine := infoFormatLines[header] if headerLine == (HeaderLineIdNumberTypeDescription{}) { - logger.Printf("Field %s not found in header, defaulting to Type 'String' and Number '1'", header) + if !Cctx.Bool("mute-warnings") { + logger.Printf("Field %s not found in header, defaulting to Type 'String' and Number '1'", header) + } headerLine = HeaderLineIdNumberTypeDescription{ Id: header, Number: "1", diff --git a/svync_api/resolve.go b/svync_api/resolve.go index 747ac6b..357bc3f 100644 --- a/svync_api/resolve.go +++ b/svync_api/resolve.go @@ -7,10 +7,12 @@ import ( "regexp" "strconv" "strings" + + cli "github.com/urfave/cli/v2" ) // Resolve a value -func ResolveValue(input string, variant *Variant, format *VariantFormat) string { +func ResolveValue(input string, variant *Variant, format *VariantFormat, Cctx *cli.Context) string { logger := log.New(os.Stderr, "", 0) // Replace all the FORMAT fields @@ -27,7 +29,9 @@ func ResolveValue(input string, variant *Variant, format *VariantFormat) string // TODO implement some alternative way to handle missing fields if !ok { - logger.Printf("The field %s is not present in the FORMAT fields of the variant with ID %s, excluding it from this variant", field, variant.Id) + if !Cctx.Bool("mute-warnings") { + logger.Printf("The field %s is not present in the FORMAT fields of the variant with ID %s, excluding it from this variant", field, variant.Id) + } } else if len(fieldSlice) > 2 { index, err := strconv.ParseInt(fieldSlice[2], 0, 64) if err != nil { @@ -50,7 +54,7 @@ func ResolveValue(input string, variant *Variant, format *VariantFormat) string // TODO implement some alternative way to handle missing fields if !ok { infoType := variant.Header.Info[field].Type - if infoType != "Flag" { + if infoType != "Flag" && !Cctx.Bool("mute-warnings") { logger.Printf("The field %s is not present in the INFO fields of the variant with ID %s, excluding it from this variant", field, variant.Id) } } else if len(fieldSlice) > 2 { diff --git a/svync_api/standardize.go b/svync_api/standardize.go index aa0c445..bc5ae77 100644 --- a/svync_api/standardize.go +++ b/svync_api/standardize.go @@ -102,7 +102,7 @@ func (variant *Variant) standardize(config *Config, Cctx *cli.Context, count int } } - standardizedVariant.Id = fmt.Sprintf("%s_%v", ResolveValue(config.Id, variant, nil), count) + standardizedVariant.Id = fmt.Sprintf("%s_%v", ResolveValue(config.Id, variant, nil, Cctx), count) // Add info fields for name, infoConfig := range config.Info { @@ -114,7 +114,7 @@ func (variant *Variant) standardize(config *Config, Cctx *cli.Context, count int if value == "" { continue } - standardizedVariant.Info[name] = []string{ResolveValue(value, variant, nil)} + standardizedVariant.Info[name] = []string{ResolveValue(value, variant, nil, Cctx)} } // Add format fields @@ -127,7 +127,7 @@ func (variant *Variant) standardize(config *Config, Cctx *cli.Context, count int if val, ok := formatConfig.Alts[sVType]; ok { value = val } - newFormat.Content[name] = []string{ResolveValue(value, variant, &format)} + newFormat.Content[name] = []string{ResolveValue(value, variant, &format, Cctx)} } standardizedVariant.Format[sample] = *newFormat } From 7b66e81ca5526bf73e559ced574512a0c29c83ed Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Tue, 16 Jan 2024 14:39:08 +0100 Subject: [PATCH 05/10] remove to breakend code --- svync_api/variant.go | 71 -------------------------------------------- 1 file changed, 71 deletions(-) diff --git a/svync_api/variant.go b/svync_api/variant.go index b861636..37664b7 100644 --- a/svync_api/variant.go +++ b/svync_api/variant.go @@ -6,7 +6,6 @@ import ( "math" "os" "regexp" - "slices" "strconv" "strings" ) @@ -102,73 +101,3 @@ func getInsLen(alt string, strand string, bracket string) int { } return len(alt[:strings.LastIndex(alt, bracket)]) } - -// Convert a breakpoint to variant to its breakend pairs -// -// Determining the ALT fields is impossible for breakpoint -> breakend conversion -func (variant *Variant) toBreakEnd() (*Variant, *Variant) { - logger := log.New(os.Stderr, "", 0) - - id1 := variant.Id + "_01" - id2 := variant.Id + "_02" - - end, err := strconv.ParseInt(variant.Info["END"][0], 0, 64) - if err != nil { - logger.Fatal(err) - } - - chr := variant.Chromosome - chr2 := "" - chrom2, ok := variant.Info["CHR2"] - if !ok { - chr2 = variant.Chromosome - } else { - chr2 = chrom2[0] - } - - info1 := map[string][]string{} - info2 := map[string][]string{} - for name, info := range variant.Info { - if slices.Contains([]string{"CHR2", "END", "SVLEN"}, name) { - continue - } - info1[name] = info - info2[name] = info - } - - info1["SVTYPE"] = []string{"BND"} - info2["SVTYPE"] = []string{"BND"} - info1["MATEID"] = []string{id2} - info2["MATEID"] = []string{id1} - - alt1 := "." - alt2 := "." - - breakend1 := &Variant{ - Chromosome: chr, - Pos: variant.Pos, - Id: id1, - Alt: alt1, - Ref: variant.Ref, - Qual: variant.Qual, - Filter: variant.Filter, - Header: variant.Header, - Info: info1, - Format: variant.Format, - } - - breakend2 := &Variant{ - Chromosome: chr2, - Pos: end, - Id: id2, - Alt: alt2, - Ref: "N", // TODO find a good way to determine the breakend2 REF - Qual: variant.Qual, - Filter: variant.Filter, - Header: variant.Header, - Info: info2, - Format: variant.Format, - } - - return breakend1, breakend2 -} From 9d6f24717e28ef390d6f6dd06a3f112f4117f160 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Tue, 16 Jan 2024 16:32:26 +0100 Subject: [PATCH 06/10] swap mates if necessary --- svync_api/variant.go | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/svync_api/variant.go b/svync_api/variant.go index 37664b7..7eaaba5 100644 --- a/svync_api/variant.go +++ b/svync_api/variant.go @@ -2,9 +2,7 @@ package svync_api import ( "fmt" - "log" "math" - "os" "regexp" "strconv" "strings" @@ -12,19 +10,20 @@ import ( // Convert a breakend variant pair to one breakpoint func toBreakPoint(mate1 *Variant, mate2 *Variant) *Variant { - logger := log.New(os.Stderr, "", 0) - alt := mate1.Alt - altRegex := regexp.MustCompile(`(\[|\])(?P[^:]*):(?P[0-9]*)`) - altGroups := altRegex.FindStringSubmatch(alt) + // Swap mates if the first one comes after the second one + if mate1.Chromosome == mate2.Chromosome && mate1.Pos > mate2.Pos { + mate1, mate2 = mate2, mate1 + } + alt := mate1.Alt chr := mate1.Chromosome pos := mate1.Pos - chr2 := altGroups[2] - pos2, err := strconv.ParseInt(altGroups[3], 0, 64) - if err != nil { - logger.Fatalf("Couldn't convert string to integer: %v", err) - } + chr2 := mate2.Chromosome + pos2 := mate2.Pos + + altRegex := regexp.MustCompile(`(\[|\])(?P[^:]*):(?P[0-9]*)`) + altGroups := altRegex.FindStringSubmatch(alt) bracket := altGroups[1] strand1 := "" strand2 := "" From 2775ce5e7062a2d8d05b87e8541745baa5d0aaeb Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Fri, 19 Jan 2024 15:04:39 +0100 Subject: [PATCH 07/10] remove --to-breakpoint for now --- svync.go | 15 ++++++++------- svync_api/execute.go | 23 ++++++++++++----------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/svync.go b/svync.go index 551279d..eb5b780 100644 --- a/svync.go +++ b/svync.go @@ -27,12 +27,13 @@ func main() { Usage: "The location to the output VCF file, defaults to stdout", Category: "Optional", }, - &cli.BoolFlag{ - Name: "to-breakpoint", - Aliases: []string{"tb"}, - Usage: "Convert pairs of breakends to a single breakpoint variant. WARNING: this will cause some loss of data.", - Category: "Optional", - }, + // TODO re-add this when conversion is implemented + // &cli.BoolFlag{ + // Name: "to-breakpoint", + // Aliases: []string{"tb"}, + // Usage: "Convert pairs of breakends to a single breakpoint variant. WARNING: this will cause some loss of data.", + // Category: "Optional", + // }, &cli.BoolFlag{ Name: "mute-warnings", Aliases: []string{"mw"}, @@ -42,7 +43,7 @@ func main() { &cli.StringFlag{ Name: "config", Aliases: []string{"c"}, - Usage: "Configuration file (YAML) to use for the parsing of INFO and FORMAT fields", + Usage: "Configuration file (YAML) used for standardizing the VCF", Required: true, Category: "Required", }, diff --git a/svync_api/execute.go b/svync_api/execute.go index f4400e1..f04c8db 100644 --- a/svync_api/execute.go +++ b/svync_api/execute.go @@ -135,20 +135,21 @@ func parseLine( if strings.HasPrefix(line, "#") { header.parse(line) } else { - id := strings.Split(line, "\t")[2] + // id := strings.Split(line, "\t")[2] variant := createVariant(line, header, Cctx) + // TODO continue work on this later // Convert breakends to breakpoints if the --to-breakpoint flag is set - if Cctx.Bool("to-breakpoint") && variant.Info["SVTYPE"][0] == "BND" && len(variant.Info["MATEID"]) == 1 { - mateid := variant.Info["MATEID"][0] - if mate, ok := (*breakEndVariants)[mateid]; ok { - variant = toBreakPoint(variant, &mate) - delete(*breakEndVariants, mateid) - } else { - (*breakEndVariants)[id] = *variant - return - } - } + // if Cctx.Bool("to-breakpoint") && variant.Info["SVTYPE"][0] == "BND" && len(variant.Info["MATEID"]) == 1 { + // mateid := variant.Info["MATEID"][0] + // if mate, ok := (*breakEndVariants)[mateid]; ok { + // variant = toBreakPoint(variant, &mate) + // delete(*breakEndVariants, mateid) + // } else { + // (*breakEndVariants)[id] = *variant + // return + // } + // } *variantCount++ standardizeAndOutput(config, Cctx, variant, outputFile, stdout, *variantCount) From 8d68e861b9696a7aca82659f22525e017f6a9488 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Mon, 22 Jan 2024 10:43:52 +0100 Subject: [PATCH 08/10] small update to readme --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index a5fc68f..c4b5a35 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,6 @@ svync --config --input | --- | --- | --- | | `--output`/`-o` | Path to the output VCF file | `stdout` | | `--nodate`/`--nd` | Do not add the date to the output VCF file | `false` | -| `--notation`/`-n` | The notation to use for the output VCF file. Must be one of: breakpoint, breakend. | none | ## Configuration The configuration file is the core of the standardization in Svync. More information can be found in the [configuration documentation](docs/configuration.md). From 1a2aacd85c9b7c145d259d25e051dd812e636b50 Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Mon, 22 Jan 2024 10:58:52 +0100 Subject: [PATCH 09/10] add mute warnings to readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index c4b5a35..21b8a8d 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,7 @@ svync --config --input | --- | --- | --- | | `--output`/`-o` | Path to the output VCF file | `stdout` | | `--nodate`/`--nd` | Do not add the date to the output VCF file | `false` | +| `--mute-warnings`/`--mw` | Do not output warnings | `false` | ## Configuration The configuration file is the core of the standardization in Svync. More information can be found in the [configuration documentation](docs/configuration.md). From efd6b02889dc2f2ba0326eec0b7ff729b5865ddf Mon Sep 17 00:00:00 2001 From: Nicolas Vannieuwkerke Date: Mon, 22 Jan 2024 11:10:08 +0100 Subject: [PATCH 10/10] fix actions not running --- .github/workflows/go.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index efd3443..606cd7c 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -5,9 +5,9 @@ name: Go on: push: - branches: [ "master" ] + branches: [ "main", "dev" ] pull_request: - branches: [ "master" ] + branches: [ "main", "dev" ] jobs: