Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor #1

Merged
merged 10 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ name: Go

on:
push:
branches: [ "master" ]
branches: [ "main", "dev" ]
pull_request:
branches: [ "master" ]
branches: [ "main", "dev" ]

jobs:

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ svync --config <config.yaml> --input <input.vcf>
| --- | --- | --- |
| `--output`/`-o` | Path to the output VCF file | `stdout` |
| `--nodate`/`--nd` | Do not add the date to the output VCF file | `false` |
| `--notation`/`-n` | The notation to use for the output VCF file. Must be one of: breakpoint, breakend. | none |
| `--mute-warnings`/`--mw` | Do not output warnings | `false` |

## Configuration
The configuration file is the core of the standardization in Svync. More information can be found in the [configuration documentation](docs/configuration.md).
Expand Down
29 changes: 13 additions & 16 deletions svync.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ package main
import (
"log"
"os"
"slices"
"strings"

"github.com/nvnieuwk/svync/svync_api"
cli "github.com/urfave/cli/v2"
Expand All @@ -29,23 +27,23 @@ func main() {
Usage: "The location to the output VCF file, defaults to stdout",
Category: "Optional",
},
&cli.StringFlag{
Name: "notation",
Aliases: []string{"n"},
Usage: "The notation to use for the output VCF file. Must be one of: breakpoint, breakend. By default the notation isn't changed",
// TODO re-add this when conversion is implemented
// &cli.BoolFlag{
// Name: "to-breakpoint",
// Aliases: []string{"tb"},
// Usage: "Convert pairs of breakends to a single breakpoint variant. WARNING: this will cause some loss of data.",
// Category: "Optional",
// },
&cli.BoolFlag{
Name: "mute-warnings",
Aliases: []string{"mw"},
Usage: "Mute all warnings.",
Category: "Optional",
Action: func(c *cli.Context, input string) error {
validNotations := []string{"breakpoint", "breakend"}
if slices.Contains(validNotations, input) {
return nil
}
return cli.Exit("Invalid notation '"+input+"', must be one of: "+strings.Join(validNotations, ", "), 1)
},
},
&cli.StringFlag{
Name: "config",
Aliases: []string{"c"},
Usage: "Configuration file (YAML) to use for the parsing of INFO and FORMAT fields",
Usage: "Configuration file (YAML) used for standardizing the VCF",
Required: true,
Category: "Required",
},
Expand All @@ -59,8 +57,7 @@ func main() {
},
Action: func(Cctx *cli.Context) error {
config := svync_api.ReadConfig(Cctx)
vcf := svync_api.ReadVcf(Cctx)
vcf.StandardizeAndOutput(config, Cctx) // Standardize VCF and write to output file
svync_api.Execute(Cctx, config)
return nil
},
}
Expand Down
196 changes: 124 additions & 72 deletions svync_api/read.go → svync_api/execute.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,60 +15,80 @@ import (
)

// Read the VCF file and return it as a VCF struct
func ReadVcf(Cctx *cli.Context) *VCF {
// Execute opens the file named by the "input" flag and hands it to parseLine
// one line at a time instead of returning a parsed VCF. A file name ending in
// ".gz" is read through a bgzf reader; anything else is read with a
// bufio.Scanner. When the "output" flag is non-empty the file it names is
// created and passed to parseLine with stdout set to false; otherwise
// outputFile stays nil and stdout stays true. Every error path ends in
// logger.Fatal/Fatalf.
func Execute(Cctx *cli.Context, config *Config) {
logger := log.New(os.Stderr, "", 0)

file := Cctx.String("input")
openFile, err := os.Open(file)
defer openFile.Close()
inputVcf, err := os.Open(file)
// NOTE(review): the Close is deferred before err is checked; the usual order
// is to check err first. Also, log's Fatal functions call os.Exit, so the
// deferred Close calls in this function do not run on any fatal path.
defer inputVcf.Close()
if err != nil {
logger.Fatal(err)
}

vcf := newVCF()
if strings.HasSuffix(file, ".gz") {
vcf.readBgzip(openFile)
} else {
vcf.readPlain(openFile)
}

return vcf
}

// Initialize a new VCF
func newVCF() *VCF {
return &VCF{
Header: Header{
Info: map[string]HeaderLineIdNumberTypeDescription{},
Format: map[string]HeaderLineIdNumberTypeDescription{},
Alt: map[string]HeaderLineIdDescription{},
Filter: map[string]HeaderLineIdDescription{},
Contig: []HeaderLineIdLength{},
},
Variants: map[string]Variant{},
}
}

// Read the VCF file in bgzip format and convert it to a VCF struct
func (vcf *VCF) readBgzip(input *os.File) {
logger := log.New(os.Stderr, "", 0)

bgReader, err := bgzf.NewReader(input, 1)
if err != nil {
logger.Fatal(err)
// State shared across all parseLine calls: the header being accumulated,
// whether it has been written yet, and the running variant count.
header := newHeader()
breakEndVariants := &map[string]Variant{}
headerIsMade := false
variantCount := 0

// Default to stdout; switch to a file only when "output" is set.
stdout := true
var outputFile *os.File
if Cctx.String("output") != "" {
stdout = false
outputFile, err = os.Create(Cctx.String("output"))
if err != nil {
logger.Fatalf("Failed to create the output file: %v", err)
}
defer outputFile.Close()
}
defer bgReader.Close()

for {
b, _, err := readBgzipLine(bgReader)
if strings.HasSuffix(file, ".gz") {
bgReader, err := bgzf.NewReader(inputVcf, 1)
if err != nil {
if err == io.EOF {
break
logger.Fatal(err)
}
defer bgReader.Close()

for {
b, _, err := readBgzipLine(bgReader)
if err != nil {
if err == io.EOF {
break
}
// NOTE(review): on a non-EOF read error this prints the bytes read so far,
// not err itself, so the actual failure reason is lost.
// NOTE(review): if readBgzipLine can return a final unterminated line
// together with io.EOF, that line is skipped by the break above; confirm.
logger.Fatal(string(b[:]))
}
logger.Fatal(string(b[:]))

parseLine(
string(bytes.TrimSpace(b[:])),
header,
breakEndVariants,
config,
Cctx,
&headerIsMade,
outputFile,
stdout,
&variantCount,
)
}
} else {
scanner := bufio.NewScanner(inputVcf)
// Give the scanner an 8 * 1000000 byte buffer and the same maximum token
// size, so single lines up to that length can be scanned.
const maxCapacity = 8 * 1000000 // 8 MB
scanner.Buffer(make([]byte, maxCapacity), maxCapacity)
for scanner.Scan() {
parseLine(
scanner.Text(),
header,
breakEndVariants,
config,
Cctx,
&headerIsMade,
outputFile,
stdout,
&variantCount,
)
}

vcf.parse(string(bytes.TrimSpace(b[:])))
if err := scanner.Err(); err != nil {
logger.Fatal(err)
}
}

}
Expand All @@ -95,44 +115,59 @@ func readBgzipLine(r *bgzf.Reader) ([]byte, bgzf.Chunk, error) {
return data, chunk, err
}

// Read the VCF file in plain text format and convert it to a VCF struct
func (vcf *VCF) readPlain(input *os.File) {
logger := log.New(os.Stderr, "", 0)

scanner := bufio.NewScanner(input)
const maxCapacity = 8 * 1000000 // 8 MB
scanner.Buffer(make([]byte, maxCapacity), maxCapacity)
for scanner.Scan() {
vcf.parse(scanner.Text())
}

if err := scanner.Err(); err != nil {
logger.Fatal(err)
// parseLine handles a single line of the input VCF. A line starting with "#"
// is passed to header.parse; any other line is turned into a Variant by
// createVariant, counted in *variantCount and passed to standardizeAndOutput.
// The first call on which the guard below passes also calls writeHeader and
// sets *headerIsMade so the header is written only once.
// breakEndVariants is referenced only by the commented-out to-breakpoint code.
func parseLine(
line string,
header *Header,
breakEndVariants *map[string]Variant,
config *Config,
Cctx *cli.Context,
headerIsMade *bool,
outputFile *os.File,
stdout bool,
variantCount *int,
) {
// NOTE(review): HasSuffix looks unintended. The branch further down detects
// header lines with HasPrefix(line, "#"); a line that starts with "#" but
// does not end with "#" already satisfies this condition, so writeHeader
// would run on the first header line, before header.parse has seen anything.
// Probably meant !strings.HasPrefix(line, "#"); confirm.
if !strings.HasSuffix(line, "#") && !*headerIsMade {
writeHeader(config, Cctx, header, outputFile, stdout)
*headerIsMade = true
}

}

// Parse the line and add it to the VCF struct
func (vcf *VCF) parse(line string) {
if strings.HasPrefix(line, "#") {
vcf.Header.parse(line)
header.parse(line)
} else {
id := strings.Split(line, "\t")[2]
variant := &Variant{}
variant.Header = &vcf.Header
variant.parse(line)
vcf.Variants[id] = *variant
// logger.Println(vcf.Variants[id])
// id := strings.Split(line, "\t")[2]
variant := createVariant(line, header, Cctx)

// TODO continue work on this later
// Convert breakends to breakpoints if the --to-breakpoint flag is set
// if Cctx.Bool("to-breakpoint") && variant.Info["SVTYPE"][0] == "BND" && len(variant.Info["MATEID"]) == 1 {
// mateid := variant.Info["MATEID"][0]
// if mate, ok := (*breakEndVariants)[mateid]; ok {
// variant = toBreakPoint(variant, &mate)
// delete(*breakEndVariants, mateid)
// } else {
// (*breakEndVariants)[id] = *variant
// return
// }
// }
// Increment before the call so the count passed on includes this variant.
*variantCount++
standardizeAndOutput(config, Cctx, variant, outputFile, stdout, *variantCount)

// Standardize and output the variant
}
}

// Parse the line and add it to the Variant struct
func (variant *Variant) parse(line string) {
func createVariant(line string, header *Header, Cctx *cli.Context) *Variant {
logger := log.New(os.Stderr, "", 0)

err := error(nil)
variant := new(Variant)
variant.Header = header

data := strings.Split(line, "\t")
variant.Chromosome = data[0]

var err error
variant.Pos, err = strconv.ParseInt(data[1], 0, 64)
if err != nil {
logger.Fatal(err)
Expand All @@ -152,7 +187,7 @@ func (variant *Variant) parse(line string) {
if len(split) > 1 {
value = split[1]
}
variant.Info[field] = parseInfoFormat(field, value, variant.Header.Info)
variant.Info[field] = parseInfoFormat(field, value, variant.Header.Info, Cctx)
}

variant.Format = map[string]VariantFormat{}
Expand All @@ -166,18 +201,22 @@ func (variant *Variant) parse(line string) {
}
for idx, val := range strings.Split(value, ":") {
header := formatHeaders[idx]
variant.Format[sample].Content[header] = parseInfoFormat(header, val, variant.Header.Format)
variant.Format[sample].Content[header] = parseInfoFormat(header, val, variant.Header.Format, Cctx)
}
}

return variant

}

// Parse the value of the INFO or FORMAT field and return it as a slice of strings
func parseInfoFormat(header string, value string, infoFormatLines map[string]HeaderLineIdNumberTypeDescription) []string {
func parseInfoFormat(header string, value string, infoFormatLines map[string]HeaderLineIdNumberTypeDescription, Cctx *cli.Context) []string {
logger := log.New(os.Stderr, "", 0)
headerLine := infoFormatLines[header]
if headerLine == (HeaderLineIdNumberTypeDescription{}) {
logger.Printf("Field %s not found in header, defaulting to Type 'String' and Number '1'", header)
if !Cctx.Bool("mute-warnings") {
logger.Printf("Field %s not found in header, defaulting to Type 'String' and Number '1'", header)
}
headerLine = HeaderLineIdNumberTypeDescription{
Id: header,
Number: "1",
Expand Down Expand Up @@ -287,3 +326,16 @@ func convertLineToMap(line string) map[string]string {

return data
}

// newHeader builds a Header whose map and slice fields listed below all start
// out empty but non-nil, and returns a pointer to it.
func newHeader() *Header {
	header := Header{
		Info:    make(map[string]HeaderLineIdNumberTypeDescription),
		Format:  make(map[string]HeaderLineIdNumberTypeDescription),
		Alt:     make(map[string]HeaderLineIdDescription),
		Filter:  make(map[string]HeaderLineIdDescription),
		Contig:  make([]HeaderLineIdLength, 0),
		Other:   make([]string, 0),
		Samples: make([]string, 0),
	}
	return &header
}
2 changes: 1 addition & 1 deletion svync_api/functions.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ func resolveFunction(input string, token string) string {
case "len":
result += fmt.Sprint(len(value[0]))
default:
logger.Fatalf("The function '%s' is not supported", value[1:])
logger.Fatalf("The function '%s' is not supported", function)
}
return result
}
Expand Down
10 changes: 7 additions & 3 deletions svync_api/resolve.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@ import (
"regexp"
"strconv"
"strings"

cli "github.com/urfave/cli/v2"
)

// Resolve a value
func ResolveValue(input string, variant *Variant, format *VariantFormat) string {
func ResolveValue(input string, variant *Variant, format *VariantFormat, Cctx *cli.Context) string {
logger := log.New(os.Stderr, "", 0)

// Replace all the FORMAT fields
Expand All @@ -27,7 +29,9 @@ func ResolveValue(input string, variant *Variant, format *VariantFormat) string

// TODO implement some alternative way to handle missing fields
if !ok {
logger.Printf("The field %s is not present in the FORMAT fields of the variant with ID %s, excluding it from this variant", field, variant.Id)
if !Cctx.Bool("mute-warnings") {
logger.Printf("The field %s is not present in the FORMAT fields of the variant with ID %s, excluding it from this variant", field, variant.Id)
}
} else if len(fieldSlice) > 2 {
index, err := strconv.ParseInt(fieldSlice[2], 0, 64)
if err != nil {
Expand All @@ -50,7 +54,7 @@ func ResolveValue(input string, variant *Variant, format *VariantFormat) string
// TODO implement some alternative way to handle missing fields
if !ok {
infoType := variant.Header.Info[field].Type
if infoType != "Flag" {
if infoType != "Flag" && !Cctx.Bool("mute-warnings") {
logger.Printf("The field %s is not present in the INFO fields of the variant with ID %s, excluding it from this variant", field, variant.Id)
}
} else if len(fieldSlice) > 2 {
Expand Down
Loading