-
Notifications
You must be signed in to change notification settings - Fork 7
/
create_reference_tsv.py
executable file
·47 lines (33 loc) · 1.25 KB
/
create_reference_tsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env python
import os
import sys
import re
# FASTQ file names are expected to look like
# <sample>_S<n>_L<lane>_R<n>_001<anything>; other names are skipped.
FASTQ_NAME_RE = re.compile(r'^(.*)_(S\d+)_L(\d+)_(R\d)_001.*$')

# The flowcell id is taken from the name of the directory holding the FASTQ:
# its last underscore-separated field, minus an optional leading 'A' or 'B'.
FLOWCELL_DIR_RE = re.compile(r'^.*_[AB]?([^_]+)$')


def build_reference_rows(fastq_paths):
    """Group FASTQ paths by read group and return one TSV row per group.

    Args:
        fastq_paths: iterable of path strings, one FASTQ path each.
            Surrounding whitespace (including a trailing newline) is
            stripped, so an open text file can be passed directly.

    Returns:
        A list of tab-joined strings (no trailing newline) with the columns
        subject, sex, status, sample, PU, fastq1[, fastq2 ...]. Rows appear
        in the order their read group was first seen in the input.

    Paths whose base name does not match FASTQ_NAME_RE (blank lines, other
    files) are ignored. When the parent directory name does not match
    FLOWCELL_DIR_RE the flowcell id falls back to "NA".
    """
    read_groups = {}
    for raw_path in fastq_paths:
        path = raw_path.strip()
        name_match = FASTQ_NAME_RE.match(os.path.basename(path))
        if not name_match:
            continue
        sample, ssheet_idx, lane, readnr = name_match.groups()

        parent_dir = os.path.basename(os.path.dirname(path))
        dir_match = FLOWCELL_DIR_RE.match(parent_dir)
        fcid = dir_match.group(1) if dir_match else "NA"

        # PU, platform unit. Will be used as read tag in BAM-files.
        # int() drops the zero padding of the lane number ("001" -> 1).
        readgrp = f"{fcid}.{int(lane)}.{ssheet_idx}"

        # The sample name recorded for a read group is the one seen first.
        _, reads = read_groups.setdefault(readgrp, (sample, []))
        reads.append((readnr, path))

    rows = []
    for readgrp, (sample, reads) in read_groups.items():
        # Order the files by read number so R1 lands in the fastq1 column
        # even if the input listed R2 first. The sort is stable, so files
        # sharing a read number keep their input order.
        reads.sort(key=lambda read: read[0])
        fqfiles = [path for _, path in reads]
        # Write to .tsv: subject sex status sample PU fastq1 fastq2
        rows.append("\t".join([sample, "ZZ", "0", sample, readgrp] + fqfiles))
    return rows


def main(argv):
    """Read a list of FASTQ paths from argv[1] and write the TSV to argv[2].

    Exits with a usage message when fewer than two arguments are supplied;
    any further arguments are ignored.
    """
    if len(argv) < 3:
        prog = os.path.basename(argv[0]) if argv else "create_reference_tsv.py"
        sys.exit(f"usage: {prog} <fastq_list_file> <output_tsv>")
    in_file, out_file = argv[1], argv[2]

    # Read everything before opening the output, so a missing input file
    # does not leave an empty output file behind.
    with open(in_file, 'r') as file_handler:
        rows = build_reference_rows(file_handler)

    with open(out_file, 'w') as outf:
        outf.writelines(f"{row}\n" for row in rows)


if __name__ == "__main__":
    main(sys.argv)