-
Notifications
You must be signed in to change notification settings - Fork 20
/
proovread.cfg
464 lines (399 loc) · 16 KB
/
proovread.cfg
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
## proovread Config File
##-- style -------------------------------------------------------------------##
## This file can be digested by proovread when provided with -c/--cfg. Options
## supersede proovread default values but will be overwritten by command line
## supplied values. The content is directly evaluated as perl code (list in
## hash context), therefore proper syntax is crucial:
## * "#" starts a comment, which is entirely ignored by the program. On
## creation, every parameter is commented out; remove the "#" at the beginning
## of the key => value line if you want your modifications to have effect
## * Keys/Strings need to be enclosed '' or ""
## * Each key requires a value
## * undef (without ''/"") is a perl keyword, use it if value is unknown/ not
## available
## * each key => value pair has to be followed by a ","
## * white space has no effect but better readability
## * since parameters are evaluated, you can use code snippets to determine
## parameters dynamically, e.g. to auto-detect the maximum number of threads
## available replace: 'threads' => 8, by 'threads' => qx(grep '^processor'
## /proc/cpuinfo | wc -l) =~ s/\n//r, or use perl glob("*.fq") to auto-detect
## input files...
## * if you want to create a customized, slim version of this config, the only
## requirement is that it forms a valid perl list of alternating keys and
## values with an even number of arguments total. E.g. these one liners would
## make equivalent, complete and proper configs:
##
## 'long-reads',"LR.fa",'short-reads',"SR.fq"
## or
## 'long-reads' => "LR.fa", 'short-reads' => "SR.fq"
##
## * In contrast to the command line behaviour, keys cannot be abbreviated
##-- command line parameter --------------------------------------------------##
## LIST of PacBio read files to correct. FASTA or FASTQ format.
'long-reads' => [],
## LIST of high confidence short read files used for correction in FASTQ or
## FASTA format.
'short-reads' => [],
## Prefix to output files. Defaults to 'proovread'
'prefix' => undef,
## Coverage cutoff for highest scoring mappings at each location.
'coverage' => 50,
## Number of threads to use for mapping. Defaults to 8 (or maximum available
## number of processors, if less than 8 available).
## NOTE(review): the value set below is 4, not the documented default of 8 -
## confirm which one is intended.
'threads' => 4,
## to auto-detect max processors use:
## 'threads' => qx(grep '^processor' /proc/cpuinfo | wc -l) =~ s/\n//r,
##
## Correction mode. 'auto' is not itself a key of 'mode-tasks'; presumably it
## is resolved to one of those modes at run time - TODO confirm.
'mode' => 'auto', # for available values see 'mode-tasks' below
## Use already created mapping in SAM/BAM format to create corrected consensus
## sequences from. Use this as alternative input to short-reads. If --sam/--bam
## is specified, --mode is automatically set to sam/bam.
'sam' => undef,
'bam' => undef,
## Sort the filtered SAM files by coordinates in addition to the sorting of
## references. This has no effect on the pipeline, and is just a convenience if
## you need the files for something else.
'sort-sam-by-coordinates' => undef,
## Specify '1' to keep the temporary files of each pass, '2' to also keep the
## individual temporary files of each thread.
'keep-temporary-files' => 0, # 0,1,2
## overwrite existing output folder
'overwrite' => 0,
## The three paths below are built from $RealBin, which is not defined in this
## file and therefore has to be in scope wherever the config is evaluated.
## shrimp2 (gmapper-ls) path, set '' to look in PATH
## don't forget the trailing /
'shrimp-path' => $RealBin.'/../util/shrimp-2.2.3/',
## bwa path, set '' to look in PATH
## don't forget the trailing /
'bwa-path' => $RealBin.'/../util/bwa/',
## blasr path, set '' to look in PATH
## don't forget the trailing /
'blasr-path' => $RealBin.'/../util/blasr-1.3.1/',
## dazzler paths (daligner, dazz-db), set '' to look in PATH
## don't forget the trailing /
'daligner-path' => '',
'dazz-db-path' => '',
## samtools path, assume exported
'samtools-path' => '',
## blast path, assume exported
'blast-path' => '',
##-- advanced parameter --------------------------------------------------##
## don't mess with these unless you know what you are doing
##-- task settings -------------------------------------------------------##
'mode-tasks' => {
    # Each key is a value accepted by 'mode'; each value is that mode's list
    # of task names. The numbered passes (bwa-sr-1..6, bwa-mr-1..6,
    # shrimp-pre-1..3) are generated with map() rather than spelled out.

    # Recommended for HiSeq reads (75-150 bp)
    'sr' => ['read-long', 'ccs-1', map("bwa-sr-$_", 1 .. 6), 'bwa-sr-finish'],
    # Recommended for MiSeq reads (150-600 bp), 454, sanger, pacbio consensus reads ...
    'mr' => ['read-long', 'ccs-1', map("bwa-mr-$_", 1 .. 6), 'bwa-mr-finish'],

    # Recommended with unitigs and HiSeq reads (75-150 bp)
    'sr+utg' => ['read-long', 'ccs-1', 'blasr-utg', map("bwa-sr-$_", 1 .. 6), 'bwa-sr-finish'],
    # Recommended with unitigs and MiSeq reads (150-600 bp), 454, sanger, pacbio consensus reads ...
    'mr+utg' => ['read-long', 'ccs-1', 'blasr-utg', map("bwa-mr-$_", 1 .. 6), 'bwa-mr-finish'],

    # As 'sr+utg' / 'mr+utg', with the 'dazzler-utg' task in place of 'blasr-utg'
    'sr+dazz-utg' => ['read-long', 'ccs-1', 'dazzler-utg', map("bwa-sr-$_", 1 .. 6), 'bwa-sr-finish'],
    'mr+dazz-utg' => ['read-long', 'ccs-1', 'dazzler-utg', map("bwa-mr-$_", 1 .. 6), 'bwa-mr-finish'],

    # Use if data aren't PacBio subreads (no 'ccs-1' task)
    'sr-noccs' => ['read-long', map("bwa-sr-$_", 1 .. 6), 'bwa-sr-finish'],
    'mr-noccs' => ['read-long', map("bwa-mr-$_", 1 .. 6), 'bwa-mr-finish'],

    # Recommended with unitigs (no 'ccs-1' task)
    'sr+utg-noccs' => ['read-long', 'blasr-utg', map("bwa-sr-$_", 1 .. 6), 'bwa-sr-finish'],
    'mr+utg-noccs' => ['read-long', 'blasr-utg', map("bwa-mr-$_", 1 .. 6), 'bwa-mr-finish'],
    'sr+dazz-utg-noccs' => ['read-long', 'dazzler-utg', map("bwa-sr-$_", 1 .. 6), 'bwa-sr-finish'],
    'mr+dazz-utg-noccs' => ['read-long', 'dazzler-utg', map("bwa-mr-$_", 1 .. 6), 'bwa-mr-finish'],

    # use with externally created SAM/BAM file
    'sam' => ['read-long', 'read-sam'],
    'bam' => ['read-long', 'read-bam'],

    # use with unitigs only
    'utg' => ['read-long', 'ccs-1', 'blasr-utg'],
    'utg-noccs' => ['read-long', 'blasr-utg'],
    'dazz-utg' => ['read-long', 'ccs-1', 'dazzler-utg'],
    'dazz-utg-noccs' => ['read-long', 'dazzler-utg'],

    # Legacy mode, similar to version used in 2014 publication
    'legacy' => ['read-long', map("shrimp-pre-$_", 1 .. 3), 'shrimp-finish'],

    # 'custom' => ['my-pass-settings', 'finish'], #...
},
##-- Chimera filter ----------------------------------------------------------##
## Option => value pairs stored under 'chimera-filter'.
'chimera-filter' => {
'--min-score' => 0.2,
'--trim-length' => 20,
'--verbose' => 2
},
##-- SeqFilter settings ------------------------------------------------------##
## Option => value pairs stored under 'seq-filter'.
'seq-filter' => {
'--trim-win' => "12,5", # mean-min, abs-min
'--min-length' => 500,
},
##-- siamaera settings -------------------------------------------------------##
## 'siamaera' => undef, # to deactivate
## The hash below is empty: no siamaera options are set here.
'siamaera' => {
},
## Long read qv-offset, required if --sam and --long are used together, and it
## cannot be detected automatically from --long file.
'lr-qv-offset' => undef,
## Long read min length
'lr-min-length' => undef, # undef => 2 x short read length
## Short read quality offset, usually 64 or 33, use 0 for FASTA. Defaults to
## guessing. Specify value if guessing fails. Needs to be the same for all
## short read files provided.
'sr-qv-offset' => undef,
## Short read length. Defaults to guessing, sampling 1000 reads from input
## file. Specify value if guessing fails.
'sr-length' => undef,
## Number of short reads provided, used for ETA calculation. Defaults to
## guessing based on 1000 randomly sampled reads. Specify value if guessing
## fails.
'sr-count' => undef,
## Toggle short read head/tail trimming, including leading/trailing indels
## (see 'sr-indel-taboo-length' below)
'sr-trim' => 1,
## Target short read coverage for iterations, keyed by task name.
## Presumably DEF applies to every task without its own entry - TODO confirm
## against the lookup code.
'sr-coverage' => {
DEF => 15,
'bwa-sr-finish' => 30,
'bwa-mr-finish' => 30,
},
## SeqChunker sampling defaults
'sr-chunk-number' => 1000,
'sr-chunk-step' => 20,
## Trim reads to prevent insertions/deletions within the first
## 'sr-indel-taboo-length' bp / 'sr-indel-taboo' fraction of the read. N=0
## deactivates the feature. The length setting supersedes the relative setting.
'sr-indel-taboo-length' => 7,
'sr-indel-taboo' => 0.1,
## Detect and identify chimera-like looking reads, keyed by task name.
## NOTE(review): 'read-sam' is enabled but 'read-bam' has no entry - confirm
## whether the bam task should mirror the sam task.
'detect-chimera' => {
DEF => 0,
'shrimp-finish' => 1,
'bwa-sr-finish' => 1,
'bwa-mr-finish' => 1,
'read-sam' => 1,
},
## Annotate and mask reads with fewer than x sr-reads mappings and no hcr
## from prev. Weakly supported reads are often contaminations - at least
## in genomic data.
'mask-weak-reads' => {
DEF => 0, # masking
#'shrimp-pre-1' => 10, # activate on contaminated genome data
#'shrimp-pre-2' => 10, # activate on contaminated genome data
},
## Annotate reads with fewer than x sr-reads, but pass them through
## unprocessed - use this to get unmasked, but annotated low_support reads
## in final iterations.
'ignore-weak-reads' => {
DEF => 0, # masking
# 'shrimp-finish' => 20, # activate on contaminated genome data
},
## Parameter string handed to SeqFilter --phred-mask, keyed by task name.
## Field order:
##   phred-min,phred-max,mask-min-len,unmask-min-len,mask-reduce,mask-end-ratio
## Set mask/unmask-min-len assuming 100bp short reads; the values will be
## dynamically adjusted to the effective short read length.
'hcr-mask' => {
    DEF => '20,41,80,130,60,0.7',
    # Passes 4-6 of both bwa series share one string that differs from DEF
    # only in its last field (0.3 instead of 0.7).
    map { ("bwa-$_" => '20,41,80,130,60,0.3') }
        qw(mr-4 mr-5 mr-6 sr-4 sr-5 sr-6),
},
## If after a regular iteration more than this fraction is masked, skip queued
## iterations and directly proceed with the "*-finish" iteration.
'mask-shortcut-frac' => 0.92,
## Minimum gain in an iteration to add another one
'mask-min-gain-frac' => 0.03,
## Number of reads to check out at once for individual consensus correction
## process. Memory intensive step, be cautious with great values
'chunk-size' => 100,
## Scale max coverage by this factor - prevents errors in low coverage regions
'coverage-scale-factor' => 0.75,
## Size in base pairs of bins for local score comparisons, keyed by mode (the
## keys of 'mode-tasks'). Every mode is listed explicitly so that none depends
## on DEF: the 'mr' family uses 50, everything else 20.
'bin-size' => {
    DEF => 20,
    # short read (sr) modes
    'sr' => 20,
    'sr+utg' => 20,
    'sr+dazz-utg' => 20,
    'sr-noccs' => 20,
    'sr+utg-noccs' => 20,
    'sr+dazz-utg-noccs' => 20,
    # medium read (mr) modes; the dazz variants mirror their blasr counterparts
    'mr' => 50,
    'mr+utg' => 50,
    'mr+dazz-utg' => 50,
    'mr-noccs' => 50,
    'mr+utg-noccs' => 50,
    'mr+dazz-utg-noccs' => 50,
    # externally created mappings
    'sam' => 20,
    'bam' => 20,
    # unitig-only modes: not used, see utg-bin-size for utg mapping
    'utg' => 20,
    'utg-noccs' => 20,
    'dazz-utg' => 20,
    'dazz-utg-noccs' => 20,
    'legacy' => 20,
},
## Regions with this coverage are considered repetitive regions. Alignments
## located to ~ >80% within repetitive regions will be filtered.
'rep-coverage' => {
DEF => undef,
'blasr-utg' => 7,
'dazzler-utg' => 7,
},
## ncscore is the relative score (score/length) of an alignment. Short
## alignments are additionally penalized by a correction factor accounting for
## greater uncertainty. Alignments scoring below min-ncscore are filtered.
'min-ncscore' => {
DEF => undef,
'dazzler-utg' => 3.7,
'blasr-utg' => 3.3,
},
## Size in base pairs of bins for local score comparisons (utg mapping)
'utg-bin-size' => 150,
## target coverage for bins
'utg-bin-coverage' => 1,
## Maximum allowed insert length in alignment - long inserts can be artefacts
'max-ins-length' => {
DEF => 0,
},
##-- mapper settings ---------------------------------------------------------##
##-- sr ----------------------------------------------------------------------##
'shrimp-sr-1' => {
    # shrimp option => value pairs, one per line; all values are strings
    '-h'         => '45%',
    '--report'   => '200',
    '-w'         => '150%',
    '-r'         => '40%',
    '--match'    => '5',
    '--mismatch' => '-11',
    '--open-r'   => '-2',
    '--open-q'   => '-1',
    '--ext-r'    => '-4',
    '--ext-q'    => '-3',
    '-s'         => '1' x 10, # "1111111111"
    '--no-mapping-qualities' => '',
},
## go strict first
##'bwa-sr-1' => {
## '-a' => '',
## '-Y' => '',
## qw(-k 12 -W 20 -w 40 -r 1.5 -D .75 -y 20 -A 5 -B 11 -O 2,1 -E 4,3 -T 3 -L 30,30)
## },
## go sensitive first
'bwa-sr' => { # bwa-12a
    # Options are written as explicit pairs instead of qw(): values such as
    # '2,1' contain commas, which qw() reports as "Possible attempt to
    # separate words with commas" when evaluated with warnings enabled.
    '-a' => '',
    '-Y' => '',
    # pacbio scoring scheme
    '-A' => '5',
    '-B' => '11',
    '-O' => '2,1',
    '-E' => '4,3',
    '-T' => 2.5, # per-base-score !!
    '-k' => '12',
    '-W' => '20',
    '-w' => '40',
    '-r' => '1',
    '-D' => '0',
    '-y' => '20',
    '-L' => '30,30',
},
'bwa-sr-finish' => {
    # Options are written as explicit pairs instead of qw(): values such as
    # '15,19' contain commas, which qw() reports as "Possible attempt to
    # separate words with commas" when evaluated with warnings enabled.
    '-a' => '',
    '-Y' => '',
    # earlier parameter sets, kept for reference:
    #qw(-k 17 -W 18 -w 40 -r 1 -D 0 -y 20 -A 5 -B 11 -O 2,1 -E 4,3 -T 3.5 -L 30,30)
    #qw(-k 17 -W 18 -w 30 -r 1.5 -D .75 -A 5 -B 13 -O 15,19 -E 3,3 -T 4 -L 30,30)
    '-k' => '17',
    '-W' => '18',
    '-w' => '30',
    '-r' => '1.5',
    '-D' => '.75',
    '-A' => '5',
    '-B' => '13',
    '-O' => '15,19',
    '-E' => '3,3',
    '-T' => '4',
    '-L' => '30,30',
},
##-- mr ----------------------------------------------------------------------##
'shrimp-mr-1' => {
    # shrimp option => value pairs, one per line; all values are strings
    '-h'         => '55%',
    '--report'   => '100',
    '-w'         => '150%',
    '-r'         => '50%',
    '--match'    => '5',
    '--mismatch' => '-11',
    '--open-r'   => '-2',
    '--open-q'   => '-1',
    '--ext-r'    => '-4',
    '--ext-q'    => '-3',
    '-s'         => '1' x 12, # "111111111111"
    '--no-mapping-qualities' => '',
},
## go sensitive first
'bwa-mr-1' => { # bwa-12a
    # Options are written as explicit pairs instead of qw(): values such as
    # '2,1' contain commas, which qw() reports as "Possible attempt to
    # separate words with commas" when evaluated with warnings enabled.
    '-a' => '',
    '-Y' => '',
    # pacbio scoring scheme
    '-A' => '5',
    '-B' => '11',
    '-O' => '2,1',
    '-E' => '4,3',
    '-T' => 2.5, # per-base-score !!
    '-k' => '12',
    '-W' => '20',
    '-w' => '40',
    '-r' => '1',
    '-D' => '0',
    '-y' => '20',
    '-L' => '30,30',
},
## go strict first
##'bwa-mr-1' => {
## '-a' => '',
## '-Y' => '',
## qw(-k 13 -W 30 -w 40 -r 1.5 -D .75 -y 10 -A 5 -B 11 -O 2,1 -E 4,3 -T 3.5 -L 30,30)
## },
'bwa-mr' => {
    # Options are written as explicit pairs instead of qw(): values such as
    # '2,1' contain commas, which qw() reports as "Possible attempt to
    # separate words with commas" when evaluated with warnings enabled.
    '-a' => '',
    '-Y' => '',
    '-k' => '13',
    '-W' => '20',
    '-w' => '40',
    '-r' => '1',
    '-D' => '.5',
    '-y' => '20',
    '-A' => '5',
    '-B' => '11',
    '-O' => '2,1',
    '-E' => '4,3',
    '-T' => '3',
    '-L' => '30,30',
},
'bwa-mr-finish' => {
    # Options are written as explicit pairs instead of qw(): values such as
    # '15,19' contain commas, which qw() reports as "Possible attempt to
    # separate words with commas" when evaluated with warnings enabled.
    '-a' => '',
    '-Y' => '',
    '-k' => '19',
    '-W' => '40',
    '-w' => '30',
    '-A' => '5',
    '-B' => '13',
    '-O' => '15,19',
    '-E' => '3,3',
    '-T' => '4',
    '-L' => '30,30',
},
##-- utg ---------------------------------------------------------------------##
'bwa-utg' => { # doesn't work too well, blasr is much better
    # Options are written as explicit pairs instead of qw(): values such as
    # '2,1' contain commas, which qw() reports as "Possible attempt to
    # separate words with commas" when evaluated with warnings enabled.
    '-a' => '',
    '-Y' => '',
    '-k' => '14',
    '-w' => '500',
    '-W' => '40',
    '-r' => '10',
    '-A' => '5',
    '-B' => '11',
    '-O' => '2,1',
    '-E' => '4,3',
    '-L' => '0,0',
},
'blasr-utg' => {
    # options that carry a value
    '-bestn'       => 100,
    '-nCandidates' => 100,
    '-minMatch'    => 17,
    # flags: present with an empty string as value
    map { ($_ => '') } qw(-affineAlign -aggressiveIntervalCut -noSplitSubreads),
},
##-----------------------------------------------------------------##
## Legacy
## shrimp pre 1
'shrimp-pre-1' => {
    '-h' => '55%', '-w' => '130%',
    '-s' => '1' x 11, # eleven 1s
    '--report' => 200,
    '--no-mapping-qualities' => '',
    # scoring
    '--match'  => 5,  '--mismatch' => -11,
    '--open-r' => -2, '--open-q'   => -1,
    '--ext-r'  => -4, '--ext-q'    => -3,
},
## shrimp pre 2
'shrimp-pre-2' => {
    '-h' => '55%', '-w' => '140%', '-r' => '45%',
    '-s' => '1' x 10, # ten 1s
    '--report' => 200,
    '--no-mapping-qualities' => '',
    # scoring
    '--match'  => 5,  '--mismatch' => -11,
    '--open-r' => -2, '--open-q'   => -1,
    '--ext-r'  => -4, '--ext-q'    => -3,
},
## shrimp pre 3
'shrimp-pre-3' => {
    '-h' => '50%', '-w' => '140%', '-r' => '35%',
    '-s' => '11111111,1111110000111111',
    '--report' => 200,
    '--no-mapping-qualities' => '',
    # scoring
    '--match'  => 5,  '--mismatch' => -11,
    '--open-r' => -2, '--open-q'   => -1,
    '--ext-r'  => -4, '--ext-q'    => -3,
},
## shrimp pre 4
'shrimp-pre-4' => {
    '-h' => '35%', '-w' => '150%', '-r' => '25%',
    '-s' => ('1' x 7) . ',111101111', # "1111111,111101111"
    '--report' => 200,
    '--no-mapping-qualities' => '',
    # scoring
    '--match'  => 5,  '--mismatch' => -11,
    '--open-r' => -2, '--open-q'   => -1,
    '--ext-r'  => -4, '--ext-q'    => -3,
},
## shrimp finish
'shrimp-finish' => {
    '-h' => '90%',
    '-s' => '1' x 20, # twenty 1s
    '--report' => 200,
    '--hash-spaced-kmers' => '',
    # scoring
    '--match'  => 5,  '--mismatch' => -10,
    '--open-r' => -5, '--open-q'   => -5,
    '--ext-r'  => -2, '--ext-q'    => -2,
},