diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b77ac4f..2d1699f9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # PanGraph Changelog +## v0.7.3 + +- bugfix in graph building: a particular edge-case would cause minor inconsistencies in the block alignment when merging graphs, see issue [#62](https://github.com/neherlab/pangraph/issues/62) and PR [#63](https://github.com/neherlab/pangraph/pull/63). + ## v0.7.2 - minor fix in tree midpoint rooting during panX export, see [#59](https://github.com/neherlab/pangraph/issues/59). diff --git a/Project.toml b/Project.toml index ec577900..f1fbe56d 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "PanGraph" uuid = "0f9f61ca-f32c-45e1-b3bc-00138f4f8814" authors = ["Nicholas Noll "] -version = "0.7.2" +version = "0.7.3" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" diff --git a/docs/src/cli/build.md b/docs/src/cli/build.md index ee6ae020..d6ea9ae3 100644 --- a/docs/src/cli/build.md +++ b/docs/src/cli/build.md @@ -17,6 +17,8 @@ Build a multiple sequence alignment pangraph. | alignment kernel | String | k | alignment-kernel | only accepts "minimap2" or "mmseqs" | | kmer length (mmseqs) | Integer | K | kmer-length | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default. | | consistency check | Boolean | t | test | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph | +| verbose mode | Boolean | v | verbose | toggle to activate verbose mode | +| debug mode | Boolean | D | debug | toggle to activate debug mode: during merging intermediate graphs are saved in the `debug` folder | | random seed | Int | r | random-seed | random seed for pangraph construction. | ## Arguments diff --git a/src/align.jl b/src/align.jl index 888f087e..30d568ea 100644 --- a/src/align.jl +++ b/src/align.jl @@ -463,6 +463,7 @@ function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Funct merges = preprocess(hits, skip, energy, block) length(merges) > 0 || break + # dictionary block-id => block blocks = align_kernel(merges, minblock, replace, verbose) merge!(blocks, G₀.block) @@ -593,6 +594,7 @@ function align_pair(G₁::Graph, G₂::Graph, energy::Function, minblock::Int, a merges = preprocess(hits, skip, energy, block) + # dictionary block-id => block blocks = align_kernel(merges, minblock, replace, verbose) sequence = merge(G₁.sequence, G₂.sequence) @@ -618,7 +620,7 @@ end # TODO: the associative array is a bit hacky... # can we push it directly into the channel? """ - align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100) + align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, debugdir=nothing) Aligns a collection of graphs `Gs` using the specified `aligner` function to recover hits. Graphs are aligned following an internal guide tree, generated using kmer distance. @@ -631,7 +633,7 @@ The _lower_ the score, the _better_ the alignment. Only negative energies are co `compare` is the function to be used to generate pairwise distances that generate the internal guide tree. """ -function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, rand_seed=0) +function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, rand_seed=0, debugdir=nothing) function verify(graph; msg="") if reference !== nothing for (name,path) ∈ graph.sequence @@ -718,7 +720,7 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h # the lock ensures that at most N=Threads.nthreads() processes are # spawning run(`cmd`) instances at the same time G₀ = lock_semaphore(s) do - verbose && log("--> align-pair for clade n. $n_clade") + verbose && log("--> align-pair for clade n. $n_clade from $(Pₗ) and $(Pᵣ)") G₀ = align_pair(Gₗ, Gᵣ, energy, minblock, aligner, verify, verbose) verbose && log("--> align-self for clade n. $n_clade") G₀ = align_self(G₀, energy, minblock, aligner, verify, verbose, maxiter=maxiter) @@ -726,12 +728,18 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h G₀ end - # DEBUG : save graph at each iteration in a file - # open("issue/comp/graph_iteration_$(n_clade).json", "w") do io - # finalize!(G₀) - # marshal(io, G₀; fmt=:json) - # end - + # if debug: save graph at each iteration in a file + if debugdir !== nothing + open("$(debugdir)/graph_iteration_$(n_clade).json", "w") do io + finalize!(G₀) + marshal(io, G₀; fmt=:json) + end + # if the file exists, remove input intermediate files that were successfully merged + fl = "$(debugdir)/graph_iteration_$(Pₗ).json" + fr = "$(debugdir)/graph_iteration_$(Pᵣ).json" + isfile(fl) && rm(fl) + isfile(fr) && rm(fr) + end # advance progress bar in a thread-safe way lock(meter_lock) do diff --git a/src/block.jl b/src/block.jl index 130596ab..ac33a27e 100644 --- a/src/block.jl +++ b/src/block.jl @@ -957,6 +957,22 @@ function reconsensus!(b::Block) return true end +# """ +# debug function that saves all of the block information on a text file. +# """ +# function all_block_info(io::IO, b::Block) +# println(io, "Block ID: ", b.uuid) +# println(io, "Sequence: ", b.sequence .|> Char |> join) +# println(io, "Gaps: ", b.gaps) +# for (name, dict) in zip(["Mutations", "Insertions", "Deletions"], [b.mutate, b.insert, b.delete]) +# println(io, "$name ---") +# for (node, value) in dict +# println(io, "\tNode: ", hash(node)) +# println(io, "\t\t", value) +# end +# end +# end + # TODO: align consensus sequences within overlapping gaps of qry and ref. # right now we parsimoniously stuff all sequences at the beginning of gaps # problems: @@ -998,7 +1014,6 @@ function rereference(qry::Block, ref::Block, segments) # TODO: allow for (-) hamming alignments gap = gapconsensus(qry, x.qry-1) pos = hamming_align(gap, ref.sequence[Δ])-1 - newgap = (Δ.stop, 0) for node ∈ keys(qry) @@ -1015,7 +1030,7 @@ function rereference(qry::Block, ref::Block, segments) start = Δ.start + pos + δ stop = start + length(ins) - 1 - if 1 ≤ start ≤ Δ.stop + if 1 ≤ start ≤ Δ.stop for i ∈ start:min(Δ.stop,stop) if ins[i-start+1] != ref.sequence[i] if node ∉ keys(combined.mutate) @@ -1080,7 +1095,7 @@ function rereference(qry::Block, ref::Block, segments) end newgap = nothing - else + else # no insertions in qry, just add as new deletion newdeletes = DelDict(node => Dict(x.ref=>Δ.stop-Δ.start+1) for node ∈ keys(qry)) merge!(combined.delete, newdeletes) end @@ -1088,9 +1103,32 @@ function rereference(qry::Block, ref::Block, segments) x = (qry=x.qry, ref=Δ.stop+1) end (Δ, nothing) => let # sequence in qry consensus not found in ref consensus - mutate = translate(lociwithin(qry.mutate,Δ),1-Δ.start) - insert = translate(lociwithin(qry.insert,Δ),1-Δ.start) - delete = translate(lociwithin(qry.delete,Δ),1-Δ.start) + mutate = lociwithin(qry.mutate,Δ) + insert = lociwithin(qry.insert,Δ) + delete = lociwithin(qry.delete,Δ) + + # delete all query insertions that will be accounted for in this segment + del_gaps = Int[] + for (node, subdict) in insert + for (locus, nuc) in subdict + # @assert locus[1] in keys(qry.gaps) + # append locus to del_gaps + push!(del_gaps, locus[1]) + delete!(qry.insert[node], locus) + end + end + + # remove corresponding query gaps + for dg in unique(del_gaps) + # @assert dg ∉ [locus[1] for (node, subdict) ∈ qry.insert for (locus, nuc) ∈ keys(subdict)] + delete!(qry.gaps, dg) + end + + # shift variations on the segment start + mutate = translate(mutate,1-Δ.start) + insert = translate(insert,1-Δ.start) + delete = translate(delete,1-Δ.start) + if (x.ref-1) ∈ keys(newgaps) # TODO: more sophisticated alignment? have to worry about overriding alignment δ = newgaps[x.ref-1] #hamming_align(qry.sequence[Δ], gapconsensus(combined.insert, newgaps[x.ref-1], x.ref-1)) - 1 @@ -1109,13 +1147,13 @@ function rereference(qry::Block, ref::Block, segments) if (δ+length(seq)) > last(newgap) newgap = (first(newgap),length(seq)+δ) end - node => InsMap((x.ref-1,δ) => seq) + node => InsMap((x.ref-1,δ) => seq) else node => InsMap() end end for node ∈ keys(qry) ) - + if last(newgap) > 0 if newgap[1] ∉ keys(newgaps) || newgap[2] > newgaps[newgap[1]] newgaps[newgap[1]] = newgap[2] @@ -1143,7 +1181,7 @@ function rereference(qry::Block, ref::Block, segments) ) for node ∈ keys(qry) ) - # XXX: hacky way to ensure deletions are not inclued in newmuts + # XXX: hacky way to ensure deletions are not included in newmuts newdels = map(qry.delete,Δq,Δr) for (node, subdict) in newdels for (pos, len) in subdict diff --git a/src/build.jl b/src/build.jl index c7e9210b..8682ca3c 100644 --- a/src/build.jl +++ b/src/build.jl @@ -84,6 +84,20 @@ Build = Command( "toggle to activate consistency check at each step of the graph merging process", false, ), + Arg( + Bool, + "debug mode", + (short = "-D", long = "--debug"), + "toggle to activate debug mode: saves intermediate graphs in `debug` folder", + false, + ), + Arg( + Bool, + "verbose mode", + (short = "-v", long = "--verbose"), + "toggle to activate verbose mode", + false, + ), Arg( Int, "random seed", @@ -93,6 +107,8 @@ Build = Command( ), ], (args) -> let + + # parse input files files = parse(Build, args) files = if files === nothing || length(files) == 0 ["/dev/stdin"] @@ -100,9 +116,28 @@ Build = Command( files end + # if verbose mode: print all arguments + verbose = arg(Build, "-v") + + if verbose + println(stderr, "pangraph build command arguments:") + for arg in Build.arg + println(stderr, "\t", arg.flag.long, " = ", arg.value) + end + end + + debugdir = nothing + if arg(Build, "-D") + # create debug directory + debugdir = joinpath(pwd(), "debug") + isdir(debugdir) || mkdir(debugdir) + end + + # parse command arguments minblock = arg(Build, "-l") circular = arg(Build, "-c") uppercase = arg(Build, "-u") + verbose = arg(Build, "-v") α = arg(Build, "-a") β = arg(Build, "-b") @@ -186,8 +221,9 @@ Build = Command( minblock = minblock, maxiter = maxiter, reference = reference, - verbose = false, + verbose = verbose, rand_seed = r_seed, + debugdir = debugdir, ) finalize!(graph)