proovread.cfg

## proovread Config File

##-- style -------------------------------------------------------------------##

## This file can be digested by proovread when provided with -c/--cfg. Options
##  set here supersede proovread default values but are overridden by values
##  supplied on the command line. The content is directly evaluated as perl
##  code (list in hash context), therefore proper syntax is crucial:
## * "#" starts a comment, which is entirely ignored by the program.
##   Parameters that are commented out have no effect: remove the "#" at the
##   beginning of the key => value line if you want your modification to apply
## * Keys/Strings need to be enclosed in '' or ""
## * Each key requires a value
## * undef (without ''/"") is a perl keyword, use it if value is unknown/ not
##   available
## * each key => value pair has to be followed by a ","
## * white space has no effect other than better readability
## * since parameters are evaluated as perl code, you can use code snippets to
##   determine them dynamically, e.g. to auto-detect the maximum number of
##   available threads replace
##     'threads' => 4,
##   by
##     'threads' => qx(grep '^processor' /proc/cpuinfo | wc -l) =~ s/\n//r,
##   or use perl's glob, e.g. 'short-reads' => [glob("*.fq")], to auto-detect
##   input files...
## * if you want to create a customized, slim version of this config, the only
##   requirement is that it forms a valid perl list of alternating keys and
##   values with an even number of arguments total. E.g. these one liners would
##   make equivalent, complete and proper configs:
##
##     'long-reads',"LR.fa",'short-reads',"SR.fq"
##   or 
##     'long-reads' => "LR.fa", 'short-reads' => "SR.fq"
##
## * In contrast to the command line behaviour, keys cannot be abbreviated


##-- command line parameter --------------------------------------------------##

## LIST (array reference) of PacBio read files to correct. FASTA or FASTQ
##  format.
'long-reads' => [],

## LIST (array reference) of high confidence short read files used for
##  correction, in FASTQ or FASTA format.
'short-reads' => [],

## Prefix to output files. Defaults to 'proovread'
'prefix' => undef,

## Coverage cutoff for highest scoring mappings at each location.
'coverage' => 50,

## Number of threads to use for mapping. The program default is described as 8
##  (or the maximum number of available processors, if less than 8 are
##  available); note that this file sets 4.
'threads' => 4,
## to auto-detect max processors use:
##  'threads' => qx(grep '^processor' /proc/cpuinfo | wc -l) =~ s/\n//r,

## Name of the task list to run, see the keys of 'mode-tasks' below.
## NOTE(review): 'auto' is not a key of 'mode-tasks' in this file - presumably
##  it is resolved by proovread itself; confirm against the program.
'mode' => 'auto', # for available values see 'mode-tasks' below

## Use an already created mapping in SAM/BAM format to create corrected
##  consensus sequences from. Use this as alternative input to short-reads. If
##  --sam/--bam is specified, --mode is automatically set to sam/bam.
'sam' => undef,
'bam' => undef,

## Sort the filtered SAM files by coordinates in addition to the sorting of
##  references. This has no effect on the pipeline, and is just a convenience if
##  you need the files for something else.
'sort-sam-by-coordinates' => undef,

## Specify '1' to keep the temporary files of each pass, '2' to also keep the
##  individual temporary files of each thread.
'keep-temporary-files' => 0,  # 0,1,2

## Overwrite an existing output folder
'overwrite' => 0,

## NOTE(review): $RealBin is not defined in this file - presumably it is
##  provided by the script that evaluates this config (e.g. via FindBin);
##  confirm before evaluating the file in another context.

## shrimp2 (gmapper-ls) path, set '' to look in PATH
## don't forget the trailing /
'shrimp-path' => $RealBin.'/../util/shrimp-2.2.3/',

## bwa path, set '' to look in PATH
## don't forget the trailing /
'bwa-path' => $RealBin.'/../util/bwa/',

## blasr path, set '' to look in PATH
## don't forget the trailing /
'blasr-path' => $RealBin.'/../util/blasr-1.3.1/',

## dazzler paths (daligner and dazz-db), set '' to look in PATH
## don't forget the trailing /
'daligner-path' => '',
'dazz-db-path' => '',

## samtools path, '' assumes samtools is exported (available in PATH)
'samtools-path' => '',

## blast path, '' assumes blast is exported (available in PATH)
'blast-path' => '',

##-- advanced parameter --------------------------------------------------##
## don't mess with these unless you know what you are doing

##-- task settings -------------------------------------------------------##

## Maps each mode name (the value of 'mode') to the ordered list of tasks
##  that are run for it. Task names are the keys used by the per-task settings
##  further down in this file (e.g. 'bwa-sr-finish', 'blasr-utg').
'mode-tasks' => {
    ##- short reads only, with ccs ------------------------------------------##
    # Recommended for HiSeq reads (75-150 bp)
    'sr' => [
        'read-long', 'ccs-1',
        'bwa-sr-1', 'bwa-sr-2', 'bwa-sr-3', 'bwa-sr-4', 'bwa-sr-5', 'bwa-sr-6',
        'bwa-sr-finish',
    ],
    # Recommended for MiSeq reads (150-600 bp), 454, sanger, pacbio consensus
    #  reads ...
    'mr' => [
        'read-long', 'ccs-1',
        'bwa-mr-1', 'bwa-mr-2', 'bwa-mr-3', 'bwa-mr-4', 'bwa-mr-5', 'bwa-mr-6',
        'bwa-mr-finish',
    ],

    ##- unitigs (blasr) + short reads, with ccs ------------------------------##
    # Recommended with unitigs and HiSeq reads (75-150 bp)
    'sr+utg' => [
        'read-long', 'ccs-1', 'blasr-utg',
        'bwa-sr-1', 'bwa-sr-2', 'bwa-sr-3', 'bwa-sr-4', 'bwa-sr-5', 'bwa-sr-6',
        'bwa-sr-finish',
    ],
    # Recommended with unitigs and MiSeq reads (150-600 bp), 454, sanger,
    #  pacbio consensus reads ...
    'mr+utg' => [
        'read-long', 'ccs-1', 'blasr-utg',
        'bwa-mr-1', 'bwa-mr-2', 'bwa-mr-3', 'bwa-mr-4', 'bwa-mr-5', 'bwa-mr-6',
        'bwa-mr-finish',
    ],

    ##- unitigs (dazzler) + short reads, with ccs ----------------------------##
    # Recommended with unitigs and HiSeq reads (75-150 bp)
    'sr+dazz-utg' => [
        'read-long', 'ccs-1', 'dazzler-utg',
        'bwa-sr-1', 'bwa-sr-2', 'bwa-sr-3', 'bwa-sr-4', 'bwa-sr-5', 'bwa-sr-6',
        'bwa-sr-finish',
    ],
    # Recommended with unitigs and MiSeq reads (150-600 bp), 454, sanger,
    #  pacbio consensus reads ...
    'mr+dazz-utg' => [
        'read-long', 'ccs-1', 'dazzler-utg',
        'bwa-mr-1', 'bwa-mr-2', 'bwa-mr-3', 'bwa-mr-4', 'bwa-mr-5', 'bwa-mr-6',
        'bwa-mr-finish',
    ],

    ##- same modes without the 'ccs-1' task ----------------------------------##
    # Use if data aren't PacBio subreads
    'sr-noccs' => [
        'read-long',
        'bwa-sr-1', 'bwa-sr-2', 'bwa-sr-3', 'bwa-sr-4', 'bwa-sr-5', 'bwa-sr-6',
        'bwa-sr-finish',
    ],
    # Use if data aren't PacBio subreads
    'mr-noccs' => [
        'read-long',
        'bwa-mr-1', 'bwa-mr-2', 'bwa-mr-3', 'bwa-mr-4', 'bwa-mr-5', 'bwa-mr-6',
        'bwa-mr-finish',
    ],
    # Recommended with unitigs:
    'sr+utg-noccs' => [
        'read-long', 'blasr-utg',
        'bwa-sr-1', 'bwa-sr-2', 'bwa-sr-3', 'bwa-sr-4', 'bwa-sr-5', 'bwa-sr-6',
        'bwa-sr-finish',
    ],
    # Recommended with unitigs:
    'mr+utg-noccs' => [
        'read-long', 'blasr-utg',
        'bwa-mr-1', 'bwa-mr-2', 'bwa-mr-3', 'bwa-mr-4', 'bwa-mr-5', 'bwa-mr-6',
        'bwa-mr-finish',
    ],
    # Recommended with unitigs:
    'sr+dazz-utg-noccs' => [
        'read-long', 'dazzler-utg',
        'bwa-sr-1', 'bwa-sr-2', 'bwa-sr-3', 'bwa-sr-4', 'bwa-sr-5', 'bwa-sr-6',
        'bwa-sr-finish',
    ],
    # Recommended with unitigs:
    'mr+dazz-utg-noccs' => [
        'read-long', 'dazzler-utg',
        'bwa-mr-1', 'bwa-mr-2', 'bwa-mr-3', 'bwa-mr-4', 'bwa-mr-5', 'bwa-mr-6',
        'bwa-mr-finish',
    ],

    ##- externally created mappings ------------------------------------------##
    # use with externally created SAM/BAM file
    'sam' => [ 'read-long', 'read-sam' ],
    'bam' => [ 'read-long', 'read-bam' ],

    ##- unitigs only ---------------------------------------------------------##
    # use with unitigs only
    'utg'            => [ 'read-long', 'ccs-1', 'blasr-utg' ],
    'utg-noccs'      => [ 'read-long', 'blasr-utg' ],
    'dazz-utg'       => [ 'read-long', 'ccs-1', 'dazzler-utg' ],
    'dazz-utg-noccs' => [ 'read-long', 'dazzler-utg' ],

    ##- legacy ---------------------------------------------------------------##
    # Legacy mode, similar to version used in 2014 publication
    'legacy' => [
        'read-long',
        'shrimp-pre-1', 'shrimp-pre-2', 'shrimp-pre-3',
        'shrimp-finish',
    ],

    # template for your own mode:
    # 'custom' => ['my-pass-settings', 'finish'],  #...
},

##-- Chimera filter ----------------------------------------------------------##
## Settings of the chimera filter, given as option => value pairs.
## NOTE(review): presumably handed on as command line options to the chimera
##  filtering tool - confirm against proovread.
'chimera-filter' => {
	'--min-score' => 0.2,
	'--trim-length' => 20,
	'--verbose' => 2
},

##-- SeqFilter settings ------------------------------------------------------##
## Settings for SeqFilter, given as option => value pairs.
'seq-filter' => {
	'--trim-win' =>  "12,5",  # mean-min, abs-min
	'--min-length' => 500,
},

##-- siamaera settings -------------------------------------------------------##
## 'siamaera' => undef, # to deactivate
## An empty hash (as below) sets no explicit options.
'siamaera' => {
},

## Long read quality (qv) offset, required if --sam and --long are used
##  together and the offset cannot be detected automatically from the --long
##  file.
'lr-qv-offset' => undef,

## Long read min length
'lr-min-length' => undef, # undef => 2 x short read length

## Short read quality offset, usually 64 or 33, use 0 for FASTA. Defaults to
##  guessing. Specify a value if guessing fails. Needs to be the same for all
##  short read files provided.
'sr-qv-offset' => undef,

## Short read length. Defaults to guessing, sampling 1000 reads from the input
##  file. Specify a value if guessing fails.
'sr-length' => undef,

## Number of short reads provided, used for ETA calculation. Defaults to
##  guessing based on 1000 randomly sampled reads. Specify a value if guessing
##  fails.
'sr-count' => undef,

## Toggle short read head/tail trimming, including leading/trailing indels
##  (see 'sr-indel-taboo-length' and 'sr-indel-taboo')
'sr-trim' => 1,

## Target short read (sr) coverage for iterations. Keys are task names (see
##  'mode-tasks'); DEF presumably applies to every task without an entry of
##  its own.
'sr-coverage' => {
              DEF => 15,
              'bwa-sr-finish' => 30,
              'bwa-mr-finish' => 30,
},

## SeqChunker sampling defaults
'sr-chunk-number' => 1000,
'sr-chunk-step' => 20,

## Trim reads to prevent insertions/deletions within the first
## 'sr-indel-taboo-length' bp / 'sr-indel-taboo' fraction of the read. N=0
## deactivates the feature. The length setting supersedes the relative setting.
'sr-indel-taboo-length' => 7,
'sr-indel-taboo' => 0.1,

## Detect and identify chimera like looking reads. Keys are task names (see
##  'mode-tasks'); DEF presumably applies to every task without an entry of
##  its own. Detection is switched on for the tasks that conclude a mode based
##  on short read mappings. 'read-bam' is listed next to 'read-sam' so that an
##  externally created mapping is treated the same whether it is supplied as
##  SAM or as BAM.
'detect-chimera' => {
	DEF => 0,
	'shrimp-finish' => 1,
	'bwa-sr-finish' => 1,
	'bwa-mr-finish' => 1,
	'read-sam' => 1,
	'read-bam' => 1,
},

## Annotate and mask reads with less than x short read mappings and no hcr
##  from previous iterations. Weakly supported reads often are contaminations -
##  at least in genomic data. Keys are task names, values are the threshold x.
'mask-weak-reads' => {
	DEF => 0, # masking
	#'shrimp-pre-1' => 10, # activate on contaminated genome data
	#'shrimp-pre-2' => 10, # activate on contaminated genome data
},

## Annotate reads with less than x short read mappings, but pass them through
##  unprocessed - use this to get unmasked, but annotated low_support reads
##  in final iterations. Keys are task names, values are the threshold x.
'ignore-weak-reads' => {
	DEF => 0, # threshold x
        # 'shrimp-finish' => 20, # activate on contaminated genome data
},

## hcr-mask parameter for SeqFilter --phred-mask, a comma separated string of
##  phred-min,phred-max,mask-min-len,unmask-min-len,mask-reduce,mask-end-ratio
## Set mask/unmask-min-len assuming 100bp short reads, the values will
##  be dynamically adjusted to the effective short read length.
## Keys are task names; the listed tasks only differ from DEF in the last
##  field (mask-end-ratio 0.3 instead of 0.7).
'hcr-mask' => {
        DEF => '20,41,80,130,60,0.7',
          'bwa-mr-4' => '20,41,80,130,60,0.3',
          'bwa-mr-5' => '20,41,80,130,60,0.3',
          'bwa-mr-6' => '20,41,80,130,60,0.3',
          'bwa-sr-4' => '20,41,80,130,60,0.3',
          'bwa-sr-5' => '20,41,80,130,60,0.3',
          'bwa-sr-6' => '20,41,80,130,60,0.3',
},

## If after a regular iteration more than this fraction is masked, skip queued
## iterations and directly proceed with the "*-finish" iteration.
'mask-shortcut-frac' => 0.92,

## Minimum gain in an iteration required to add another one
'mask-min-gain-frac' => 0.03,

## Number of reads to check out at once for the individual consensus correction
##  process. Memory intensive step, be cautious with large values
'chunk-size' => 100,

## Scale max coverage by this factor - prevents errors in low coverage regions
'coverage-scale-factor' => 0.75,

## Size in base pairs of bins for local score comparisons. Keys are mode names
##  (see 'mode-tasks'); DEF presumably applies to every mode without an entry
##  of its own. The dazzler unitig modes are listed explicitly so that the
##  'mr+dazz-utg*' modes get the same bin size as all other 'mr' modes instead
##  of falling back to DEF.
'bin-size' => {
    DEF => 20,
    # sr modes (HiSeq-type reads, 75-150 bp)
    'sr' => 20,
    'sr+utg' => 20,
    'sr+dazz-utg' => 20,
    'sr-noccs' => 20,
    'sr+utg-noccs' => 20,
    'sr+dazz-utg-noccs' => 20,
    # mr modes (MiSeq-type reads, 150-600 bp)
    'mr' => 50,
    'mr+utg' => 50,
    'mr+dazz-utg' => 50,
    'mr-noccs' => 50,
    'mr+utg-noccs' => 50,
    'mr+dazz-utg-noccs' => 50,
    # external mappings
    'sam' => 20,
    'bam' => 20,
    'utg' => 20, # not used, see utg-bin-size for utg mapping
    'legacy' => 20,
},

## Regions with this coverage are considered repetitive regions. Alignments
## located to more than roughly 80% within repetitive regions will be filtered.
## Keys are task names; only the unitig tasks set a value, DEF is undef.
'rep-coverage' => {
    DEF => undef,
    'blasr-utg' => 7,
    'dazzler-utg' => 7,
},

## ncscore is the relative score (score/length) of an alignment. Short
## alignments are additionally penalized by a correction factor accounting for
## greater uncertainty. Alignments scoring below min-ncscore are filtered.
## Keys are task names; only the unitig tasks set a value, DEF is undef.
'min-ncscore' => {
    DEF => undef,
    'dazzler-utg' => 3.7,
    'blasr-utg' => 3.3,
},


## Size in base pairs of bins for local score comparisons of unitig (utg)
##  mappings (the counterpart of 'bin-size')
'utg-bin-size' => 150,

## Target coverage for bins (utg mappings)
'utg-bin-coverage' => 1,

## Maximum allowed insert length in alignment - long inserts can be artefacts
'max-ins-length' => {
        DEF => 0,
},


##-- mapper settings ---------------------------------------------------------##
##-- sr ----------------------------------------------------------------------##
## shrimp settings for short reads, written as explicit option => value pairs.
##  Values are kept as strings, exactly as a qw() list would produce them.
'shrimp-sr-1' => {
    '-h'         => '45%',
    '--report'   => '200',
    '-w'         => '150%',
    '-r'         => '40%',
    '--match'    => '5',
    '--mismatch' => '-11',
    '--open-r'   => '-2',
    '--open-q'   => '-1',
    '--ext-r'    => '-4',
    '--ext-q'    => '-3',
    '-s'         => '1' x 10,   # ten 1s: 1111111111
    '--no-mapping-qualities' => '',
},
## Alternative (inactive): go strict first
##'bwa-sr-1' => {
##               '-a' => '',
##               '-Y' => '',
##               qw(-k 12 -W 20 -w 40 -r 1.5 -D .75 -y 20 -A 5 -B 11 -O 2,1 -E 4,3 -T 3 -L 30,30)
##              },
## Active: go sensitive first. Options are written as explicit
##  option => value pairs; values are kept as strings, exactly as a qw() list
##  would produce them.
'bwa-sr' => { # bwa-12a
    '-a' => '',
    '-Y' => '',
    # pacbio scoring scheme
    '-A' => '5',
    '-B' => '11',
    '-O' => '2,1',
    '-E' => '4,3',
    '-T' => 2.5, # per-base-score !!
    # remaining bwa options
    '-k' => '12',
    '-W' => '20',
    '-w' => '40',
    '-r' => '1',
    '-D' => '0',
    '-y' => '20',
    '-L' => '30,30',
},
## bwa settings for the 'bwa-sr-finish' task, written as explicit
##  option => value pairs. Values are kept as strings, exactly as a qw() list
##  would produce them (note '.75', not 0.75).
'bwa-sr-finish' => {
    '-a' => '',
    '-Y' => '',
    # alternative parameter set (inactive):
    #qw(-k 17 -W 18 -w 40 -r 1 -D 0 -y 20 -A 5 -B 11 -O 2,1 -E 4,3 -T 3.5 -L 30,30)
    # active parameter set, equivalent to:
    #qw(-k 17 -W 18 -w 30 -r 1.5 -D .75 -A 5 -B 13 -O 15,19 -E 3,3 -T 4 -L 30,30)
    '-k' => '17',
    '-W' => '18',
    '-w' => '30',
    '-r' => '1.5',
    '-D' => '.75',
    '-A' => '5',
    '-B' => '13',
    '-O' => '15,19',
    '-E' => '3,3',
    '-T' => '4',
    '-L' => '30,30',
},

##-- mr ----------------------------------------------------------------------##
## shrimp settings for mr reads, written as explicit option => value pairs.
##  Values are kept as strings, exactly as a qw() list would produce them.
'shrimp-mr-1' => {
    '-h'         => '55%',
    '--report'   => '100',
    '-w'         => '150%',
    '-r'         => '50%',
    '--match'    => '5',
    '--mismatch' => '-11',
    '--open-r'   => '-2',
    '--open-q'   => '-1',
    '--ext-r'    => '-4',
    '--ext-q'    => '-3',
    '-s'         => '1' x 12,   # twelve 1s: 111111111111
    '--no-mapping-qualities' => '',
},
## go sensitive first. Options are written as explicit option => value pairs;
##  values are kept as strings, exactly as a qw() list would produce them.
'bwa-mr-1' => { # bwa-12a
    '-a' => '',
    '-Y' => '',
    # pacbio scoring scheme
    '-A' => '5',
    '-B' => '11',
    '-O' => '2,1',
    '-E' => '4,3',
    '-T' => 2.5, # per-base-score !!
    # remaining bwa options
    '-k' => '12',
    '-W' => '20',
    '-w' => '40',
    '-r' => '1',
    '-D' => '0',
    '-y' => '20',
    '-L' => '30,30',
},
## Alternative (inactive) for 'bwa-mr-1': go strict first
##'bwa-mr-1' => {
##              '-a' => '',
##              '-Y' => '',
##              qw(-k 13 -W 30 -w 40 -r 1.5 -D .75 -y 10 -A 5 -B 11 -O 2,1 -E 4,3 -T 3.5 -L 30,30)
##             },
## bwa settings under the key 'bwa-mr', written as explicit option => value
##  pairs. Values are kept as strings, exactly as a qw() list would produce
##  them (note '.5', not 0.5).
'bwa-mr' => {
    '-a' => '',
    '-Y' => '',
    '-k' => '13',
    '-W' => '20',
    '-w' => '40',
    '-r' => '1',
    '-D' => '.5',
    '-y' => '20',
    '-A' => '5',
    '-B' => '11',
    '-O' => '2,1',
    '-E' => '4,3',
    '-T' => '3',
    '-L' => '30,30',
},
## bwa settings for the 'bwa-mr-finish' task, written as explicit
##  option => value pairs. Values are kept as strings, exactly as a qw() list
##  would produce them.
'bwa-mr-finish' => {
    '-a' => '',
    '-Y' => '',
    '-k' => '19',
    '-W' => '40',
    '-w' => '30',
    '-A' => '5',
    '-B' => '13',
    '-O' => '15,19',
    '-E' => '3,3',
    '-T' => '4',
    '-L' => '30,30',
},

##-- utg ---------------------------------------------------------------------##
## bwa settings for unitig mapping - doesn't work too well, blasr is much
##  better. No mode in 'mode-tasks' of this file lists a 'bwa-utg' task.
##  Options are written as explicit option => value pairs; values are kept as
##  strings, exactly as a qw() list would produce them.
'bwa-utg' => {
    '-a' => '',
    '-Y' => '',
    '-k' => '14',
    '-w' => '500',
    '-W' => '40',
    '-r' => '10',
    '-A' => '5',
    '-B' => '11',
    '-O' => '2,1',
    '-E' => '4,3',
    '-L' => '0,0',
},
## blasr settings for the 'blasr-utg' task (unitig mapping, see 'mode-tasks'),
##  given as option => value pairs. Options with value '' carry no argument
##  here - presumably they are plain flags; confirm against blasr.
'blasr-utg' => {
                    '-bestn' => 100,
                    '-nCandidates' => 100,
                    '-affineAlign' => '',
                    '-minMatch' => 17,
                    '-aggressiveIntervalCut' => '',
                    '-noSplitSubreads' => '',
             },

              
##-----------------------------------------------------------------##
## Legacy
## Settings for the shrimp tasks of the 'legacy' mode (see 'mode-tasks'),
##  given as option => value pairs.
## shrimp pre 1
'shrimp-pre-1' => {
	'-h' => "55%",
	'--report' => 200,
	'-s' => "1"x11, # eleven 1s
	'-w' => "130%",
	'--no-mapping-qualities' => '',
	'--match' => 5,
	'--mismatch' => -11,
	'--open-r' => -2,
	'--open-q' => -1,
	'--ext-r' => -4,
	'--ext-q' => -3,
},

## shrimp pre 2
## Compared to 'shrimp-pre-1': shorter -s (ten 1s), -w 140% and an explicit
##  -r of 45%.
'shrimp-pre-2' => {
	'-h' => "55%",
	'--report' => 200,
	'-s' => "1"x10, # ten 1s
	'-w' => "140%",
	'-r' => "45%",
	'--no-mapping-qualities' => '',
	'--match' => 5,
	'--mismatch' => -11,
	'--open-r' => -2,
	'--open-q' => -1,
	'--ext-r' => -4,
	'--ext-q' => -3,
},

## shrimp pre 3
## Compared to 'shrimp-pre-2': -h 50%, -r 35% and two comma separated -s
##  patterns, the second one containing 0s.
'shrimp-pre-3' => {
	'-h' => "50%",
	'--report' => 200,
	'-s' => "11111111,1111110000111111",
	'-w' => "140%",
	'-r' => "35%",
	'--no-mapping-qualities' => '',
	'--match' => 5,
	'--mismatch' => -11,
	'--open-r' => -2,
	'--open-q' => -1,
	'--ext-r' => -4,
	'--ext-q' => -3,
},

## shrimp pre 4
## Note: the 'legacy' task list in 'mode-tasks' of this file does not include
##  'shrimp-pre-4'; add it there if this pass should be run.
'shrimp-pre-4' => {
	'-h' => "35%",
	'--report' => 200,
	'-s' => ("1"x7).",111101111", # evaluates to "1111111,111101111"
	'-w' => "150%",
	'-r' => "25%",
	'--no-mapping-qualities' => '',
	'--match' => 5,
	'--mismatch' => -11,
	'--open-r' => -2,
	'--open-q' => -1,
	'--ext-r' => -4,
	'--ext-q' => -3,
},

## shrimp finish
## Settings of the last task of the 'legacy' mode. Differs from the
##  'shrimp-pre-*' settings: -h 90%, -s of twenty 1s, '--hash-spaced-kmers',
##  a different scoring scheme and no '--no-mapping-qualities'.
'shrimp-finish' => {
	'-h' => "90%",
	'--report' => 200,
	'-s' => "1"x20, # twenty 1s
	'--hash-spaced-kmers' => '',
	'--match' => 5,
	'--mismatch' => -10,
	'--open-r' => -5,
	'--open-q' => -5,
	'--ext-r' => -2,
	'--ext-q' => -2,
},