-
Notifications
You must be signed in to change notification settings - Fork 0
/
config.yaml
226 lines (177 loc) · 6.9 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# ======================== #
# bincs configuration file #
# ======================== #
# -------
# General
# -------
# Settings common to all the targets.

# Directory in which the intermediate and final results are stored. It is
# created if it does not exist.
# (Required)
prefix: /mnt/scratch/projects/epipp

# If set, ensures that Snakemake won't start unless you use the tmux session
# manager.
# (Not required)
tmux: true
# -------------
# Sample Sheets
# -------------
# A sample sheet lists the sequence files to use, the type of each file
# (ChIP/Input) and the experimental condition it belongs to.

# The main sample sheet.
# (Required)
sample_sheet: 'sample_sheet_fixed.txt'

# Sample sheet of external controls, for example the same ChIP used in a
# different species. Aligning these files to the genome in use shows which bins
# get a lot of reads, which helps to find non-specific binding. Such bins are
# candidates for blacklisting and are tested according to a Poisson model.
# (Not required)
external_control_sample_sheet: 'sample_sheet_controls.txt'
# -----------------------
# Genomes And Annotations
# -----------------------
# Which genomes and annotations to use.

# UCSC name of the genome to use. Mostly used to automatically fetch genome and
# chromosome sizes.
# (Required)
genome: 'hg38'
# ----------------------
# Gene Overlap Barcharts
# ----------------------
# Settings for the gene overlap barcharts.
# Radius around the TSS/TES that is considered a part of the TSS/TES. E.g. if
# 1000, any position within 1000 nucleotides of the TSS/TES is considered to
# be part of the TSS/TES.
# (Not required)
barchart_tss_length: 3000
# ---------
# Alignment
# ---------
# These flags and options are only of interest if your filetype is fastq.

# The prefix of the hisat2 genome index. Premade indexes can be found at
#   ftp://ftp.ccb.jhu.edu/pub/infphilo/hisat2/data
# These indexes are used to align the fastqs to the genome. If the input is not
# fastq, these files are not used.
# (Not required)
hisat2_index_prefix: /mnt/cargo/genomes/hisat2/hg38/genome

# Additional flags to give to hisat2. Quoted so that the leading dashes can
# never be mistaken for YAML syntax.
# (Not required)
hisat2_extra_flags: '--no-spliced-alignment -k 1 --no-discordant --no-mixed'

# Whether multi-aligning reads are kept.
# NOTE(review): the previous description said that setting this flag *removes*
# multi-aligning reads "(i.e. those with a NH:i field of 1)". That contradicts
# the key name, and NH:i:1 marks a read with a single reported alignment, not
# a multi-aligning one. Presumably true keeps multi-aligning reads and false
# keeps only reads with NH:i:1 - TODO confirm against the pipeline.
# (Not required)
keep_multi_aligning_reads: true

# A list of adapters to remove from your fastqs. Currently unset (null).
# (Not required)
adapters: null

# The minimum length of your reads after adapter removal.
# (Not required)
min_read_length: 14
# ----------
# Sequencing
# ----------
# Settings describing the sequenced reads.

# Whether or not the reads are paired end.
# (Required)
paired_end: false

# Estimated size of the sequenced fragments. Currently unset (null).
# NOTE(review): this key is marked as required but has no value, while
# fragment_lengths below is set. Presumably only one of the two has to be
# given - TODO confirm.
# (Required)
fragment_length: null

# A two-column headerless file of estimated sizes of the sequenced fragments
# per group. Column 1 holds the group names, column 2 the fragment sizes.
# (Required)
fragment_lengths: "private/epipp/frag_sizes.txt"

# Minimum required fastq quality.
# (Not required)
fastq_quality: 20
# -------------------------
# Heatmaps And Profileplots
# -------------------------
# Settings for heatmaps and profileplots.

# The gzipped gencode annotation file to use. It is used to find the regions to
# be plotted in heatmaps and profileplots. If the path starts with www/http/ftp
# it will be downloaded, otherwise the path will be interpreted to be local.
# This kind of high-quality annotation only exists for humans and mice. If your
# genome is any other, the appropriate regions will be downloaded from UCSC
# refGene for your genome.
# (Not required)
annotation_gff3: ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_27/gencode.v27.annotation.gff3.gz

# How to sort the heatmaps. Only applicable if sort_order_group is not used.
# Options: keep, no, descend, ascend. keep preserves the sort order of the
# regions and gives all the groups the same sort order. descend/ascend sort
# each group according to the signal.
# If you choose the option no, write it quoted ('no'): YAML 1.1 parsers read a
# bare no as the boolean false.
# (Not required)
sort_order: keep

# If given, all heatmaps are sorted according to the sort order for this group.
# This makes it easy to compare heatmaps between groups. If not given, each
# heatmap is sorted individually. Currently unset (null).
# (Not required)
sort_order_group: null

# Only applicable for group_vs_group heatmaps! If given, all group vs group
# heatmaps are sorted according to the sorting for sort_order_group vs
# second_sort_order_group. This makes it easy to compare heatmaps between
# groups. If not given, each heatmap is sorted individually. Currently unset
# (null).
# (Not required)
second_sort_order_group: null

# The distance to be shown upstream and downstream of any region with 'gene' in
# the name in the heatmaps and profileplots.
# (Not required)
tss_distance_gene: 3000

# The distance to be shown upstream and downstream of any region without 'gene'
# in the name in the heatmaps and profileplots.
# (Not required)
tss_distance_other: 500

# `regions` is a list of the regions to be plotted in the heatmaps and
# profileplots. The following regions are valid:
#   'CDS', 'exon', 'five_prime_UTR', 'gene', 'start_codon', 'stop_codon',
#   'stop_codon_redefined_as_selenocysteine', 'three_prime_UTR', 'transcript',
#   'internal_exon'
# Using it requires that you either have the configuration variable
# `remote_annotation_gff3` or `local_annotation_gff3` defined.
# NOTE(review): this file defines neither of those keys, only
# `annotation_gff3` - presumably the names above are outdated; confirm.
# NOTE(review): 'tss' is used below but is missing from the list of valid
# regions above - confirm that it is accepted.
# (Not required)
regions: ['exon', 'gene', 'internal_exon', 'tss']

# `custom_regions` is a map between region names and bed-files used to define
# the custom regions. Custom region files can also be used to draw different
# lines and colors for different regions in the same plot. This can be done by
# adding a seventh column called deepTools_group to the file. If used, the file
# must be sorted on the column deepTools_group.
# NOTE(review): a bare None is read by YAML as the string "None", not as null.
# Confirm how the pipeline treats this string before replacing it with null.
# (Not required)
custom_regions: None
# -----------------------
# Differential Enrichment
# -----------------------
# Settings for the differential enrichment analysis (limma and related).
# Contrasts to test for differential enrichment between groups. The names used
# must be the same as the column names in the design_matrix. By default each
# group in the design matrix is tested against the others in pairs. By default
# a design matrix is used in which each group in the sample sheet is a column;
# the command used is model.matrix(~0 + groups).
# NOTE(review): the original text described this default as a design matrix
# "with an intercept", but in an R formula a leading `~0 +` removes the
# intercept - confirm which is intended.
# NOTE(review): a bare None is read by YAML as the string "None", not as null
# - confirm how the pipeline interprets it.
# (Not required)
contrasts: None
# The file used to describe the experimental setup.
# NOTE(review): as above, None is the string "None" here, not null.
# (Not required)
design_matrix: None
# ------------
# Peak Calling
# ------------
# Settings for peak calling.

# Peak callers to run. Valid values are epic and macs2.
# (Not required)
peak_callers: ['epic']

# Value of the epic gaps-allowed parameter.
# (Not required)
epic_gap: 3

# Value of the epic window-size parameter.
# (Not required)
epic_window_size: 200

# Value of the macs2 --broad parameter. Currently unset (null).
# (Not required)
macs2_broad: null

# Any further flags to pass to macs2. Currently unset (null).
# (Not required)
macs2_extra_flags: null

# Window size that csaw should use.
# (Not required)
csaw_window_size: 200

# Minimum count sum across libraries that a bin needs in order to be included
# in the analysis.
# (Not required)
csaw_filter: 1