-
Notifications
You must be signed in to change notification settings - Fork 0
/
import-crs.nf
executable file
·98 lines (77 loc) · 2.36 KB
/
import-crs.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env nextflow
// Copy the gzipped BED files found under <params.crs.path>/data into the task
// directory and decompress them. Because of `mode flatten`, every resulting
// .bed file is sent to `raw_crs` as a separate item.
process fetch_crs_bed {
  input:
  val(remote) from Channel.from(params.crs.path)

  output:
  file('*.bed') into raw_crs mode flatten

  script:
  """
  cp $remote/data/*.bed.gz .
  gzip -d *.bed.gz
  """
}
// Derive an assembly name for each CRS BED file by stripping the fixed prefix
// and suffix from its file name, then hand the (assembly, file) pairs to the
// two consumers below.
raw_crs
  .map { bed ->
    def base_name = file(bed).name
    def without_prefix = base_name.replace("cmf_extend.", "")
    def assembly = without_prefix.replace(".fdr10.nonredundant.bed", "")
    [ assembly, bed ]
  }
  .into { crs_bed_with_assemblies; pre_fetch }

// Drop the BED file and pair each CRS assembly name with the value configured
// for it in params.crs.assembly_rnacentral_mapping.
pre_fetch
  .map { crs_assembly, bed ->
    def mapped = params.crs.assembly_rnacentral_mapping[crs_assembly]
    [crs_assembly, mapped]
  }
  .into { for_rfam_fetch; for_rnacentral_fetch }

// Append the Rfam query file to every (crs assembly, mapped assembly) pair.
def rfam_query = Channel.fromPath("files/crs/fetch-rfam.sql")
for_rfam_fetch
  .combine(rfam_query)
  .set { rfam_to_fetch }
// Run the Rfam query for one assembly and write the result as a sorted BED
// file.
//
// Input tuple (from rfam_to_fetch):
//   crs_assembly - assembly name parsed from the CRS BED file name; only used
//                  as the key of the output tuple so it can be joined later.
//   assembly     - value looked up in params.crs.assembly_rnacentral_mapping
//                  for that CRS assembly; used for the query and the output
//                  file name.
//   query        - SQL file executed with psql.
// Output: (crs_assembly, rfam-<assembly>.bed) into rfam_coordinates.
// At most 4 instances of this process run at the same time (maxForks).
process fetch_rfam_locations {
  maxForks 4

  input:
  set val(crs_assembly), val(assembly), file(query) from rfam_to_fetch

  output:
  set val(crs_assembly), file("rfam-${assembly}.bed") into rfam_coordinates

  script:
  // The query is run against the same database as fetch_rnacentral_bed, so it
  // is given the mapped assembly (as that process does) rather than the name
  // parsed from the CRS file name, which previously was passed here.
  // NOTE(review): `$PGDATABASE` is not escaped, so it is substituted by
  // Nextflow rather than by the shell - confirm it is visible to the script.
  """
  psql -v ON_ERROR_STOP=1 -v "assembly_id=$assembly" -f $query "$PGDATABASE" > result.json
  rnac ftp-export coordinates as-bed result.json result.bed
  bedtools sort -i result.bed > rfam-${assembly}.bed
  """
}
// Append the RNAcentral query file to every (crs assembly, mapped assembly)
// pair.
def rnacentral_query = Channel.fromPath("files/crs/fetch-rnacentral.sql")
for_rnacentral_fetch
  .combine(rnacentral_query)
  .set { rnacentral_assemblies_to_fetch }
// Run the RNAcentral query for one assembly and write the result as a sorted
// BED file.
//
// Input tuple (from rnacentral_assemblies_to_fetch):
//   crs_assembly - passed through unchanged as the key of the output tuple.
//   assembly     - given to the query as `assembly_id` and used in the output
//                  file name.
//   query        - SQL file executed with psql.
// Output: (crs_assembly, rnacentral-<assembly>.bed) into rnacentral_locations.
// At most 4 instances of this process run at the same time (maxForks).
process fetch_rnacentral_bed {
  maxForks 4

  input:
  set val(crs_assembly), val(assembly), file(query) from rnacentral_assemblies_to_fetch

  output:
  set val(crs_assembly), file("rnacentral-${assembly}.bed") into rnacentral_locations

  script:
  """
  psql -v ON_ERROR_STOP=1 -v "assembly_id=$assembly" -f $query "$PGDATABASE" > result.json
  rnac ftp-export coordinates as-bed result.json > result.bed
  bedtools sort -i result.bed > rnacentral-${assembly}.bed
  """
}
// Match items on their first element (the CRS assembly name) so that each
// emitted item is (assembly, CRS BED, Rfam BED, RNAcentral BED).
crs_bed_with_assemblies
  .join(rfam_coordinates, by: 0)
  .join(rnacentral_locations, by: 0)
  .set { crs_to_clean }
// Run `crs-overlaps` on the CRS, Rfam and RNAcentral BED files of one assembly
// and convert the selection into complete_features.csv.
//
// Input tuple (from crs_to_clean): assembly name plus the three BED files.
// Output: complete_features.csv into processed_crs.
process find_rnacentral_crs_features {
  input:
  set val(assembly), file(crs), file(rfam), file(rnacentral) from crs_to_clean

  output:
  file('complete_features.csv') into processed_crs

  script:
  // Last argument of crs-overlaps: '1' when the assembly is listed in
  // params.crs.must_clean_bed, '0' otherwise.
  def clean_flag = '0'
  if (params.crs.must_clean_bed.contains(assembly)) {
    clean_flag = '1'
  }
  // NOTE(review): selected_crs.tsv is not created by anything visible here;
  // presumably crs-overlaps writes it - confirm.
  """
  crs-overlaps $crs $rfam $rnacentral $clean_flag
  rnac crs selected_crs.tsv complete_features.csv
  """
}
// Load the CRS features with pgloader once all feature files are available.
//
// Inputs:
//   - every file sent to processed_crs, collected into a single item and
//     staged under names matching complete_features*.csv
//   - the pgloader control file files/crs/load.ctl
// The process declares no outputs.
process import_crs {
  input:
  file('complete_features*.csv') from processed_crs.collect()
  file(ctl) from Channel.fromPath('files/crs/load.ctl')

  script:
  // NOTE(review): the control file is copied to a fixed name before use;
  // presumably it refers to the staged CSV files - confirm against load.ctl.
  """
  cp $ctl crs.ctl
  pgloader --on-error-stop crs.ctl
  """
}