-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_mr2_test_cases.py
53 lines (48 loc) · 1.52 KB
/
get_mr2_test_cases.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#! /usr/bin/env python3
"""
get_mr2_test_cases.py
Creates fastq files with duplicated reads from a folder containing fastq files.
Author Loaiza, K.
Comments Created: September 11, 2021.
"""
# Libraries
import os
import numpy as np
from Bio import SeqIO
# Folders
in_dir = 'mr0_test_cases'
out_dir = 'mr2_test_cases'
# Create the new output folder
if not os.path.exists(out_dir):
os.mkdir(out_dir)
# Remove the old output folder
else:
os.remove(out_dir)
# Get fastq files
files_list = os.listdir(in_dir)
files_list = [file for file in files_list if file.endswith('.fastq')]
for file in files_list:
# Get filename without .fastq
# In1538_MF974580.fastq > In1538_MF974580
name = os.path.splitext(file)[0]
# Get filepath
# mr0_test_cases/In1538_MF974580.fastq
old_file = os.path.join(in_dir, file)
# Open the fastq file
for r in SeqIO.parse(old_file, 'fastq'):
# Get the lenght of the fastq sequence
P = len(r.seq)
# Get I random indices
I = 1
indices = np.sort(np.random.randint(P, size = I))
print(f'{r.id} size {P} split at {indices}')
# Get the chunks of sequences
s1 = r[:indices[0]]
s2 = r[indices[0]+1:]
s3 = r[:indices[0]]
s4 = r[indices[0]+1:]
records = [s1, s2, s3, s4]
# Get the new filepath
new_file = os.path.join(out_dir, name)
# Save the chunks of sequences in one fastq file
SeqIO.write(records, f'{new_file}.fastq', 'fastq')