-
Notifications
You must be signed in to change notification settings - Fork 2
/
generate_synthetic_data.py
65 lines (51 loc) · 3.58 KB
/
generate_synthetic_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/python
import time
import threaded
from gff import *
from generate_sequence import *
from write_fastq import *
from sequence_calculation import *
def _read_fasta_records(fa_file):
    """Return a dict mapping each FASTA header (minus its first character) to the line after it.

    Only the single line following a header is taken as its sequence, so
    sequences wrapped over several lines are not joined.
    """
    with open(fa_file) as handle:
        lines = handle.read().split('\n')
    # Drop only the empty element produced by a trailing newline; a file that
    # does not end with a newline must keep its last sequence line.
    if lines and lines[-1] == '':
        lines.pop()

    records = {}
    first_seen = {}  # header line -> index of its first occurrence
    for idx, line in enumerate(lines):
        if ">" not in line:
            continue
        # A header line that appears more than once resolves to the sequence
        # following its first occurrence.
        src = first_seen.setdefault(line, idx)
        records[line[1:]] = lines[src + 1]
    return records


def generate_synthetic_data(fa_file, total_no_seq, std_seq_percent, seed_error_percent,
                            xseed_error_percent, both_error_percent, depth, ascii_base, out,
                            out_file_name, ground_truth_filename, gff_file, mir_type,
                            out_file_type, repeat, distribution, seed, adaptor,
                            no_mismatch_seed, no_mismatch_xseed, parallel_thread):
    """Generate synthetic reads in four stages, write them out, and merge the ground truth.

    The stages run in order: pure sequences, sequences with errors in the seed
    region, outside the seed region, and in both. Each stage calls
    ``sequence_calculation`` with its own percentage and then
    ``generate_sequence`` with the matching region label. The first stage uses
    mode ``'write'``; the later ones use ``'append'``.

    Args:
        fa_file: Path of the FASTA file; each header line is paired with the
            line that follows it.
        total_no_seq, depth, seed: Forwarded to ``sequence_calculation`` for
            every stage (``depth`` and ``seed`` also go to ``generate_sequence``;
            ``seed`` also goes to ``write_fastq``).
        std_seq_percent, seed_error_percent, xseed_error_percent,
        both_error_percent: Per-stage percentage passed to
            ``sequence_calculation``.
        ascii_base, out_file_name, out_file_type, adaptor, parallel_thread:
            Forwarded to ``write_fastq``.
        out: Output directory; holds the ground-truth file and a ``temp``
            sub-directory that is removed at the end.
        ground_truth_filename: Name of the ground-truth file inside ``out``.
        gff_file, mir_type: Forwarded to ``gff`` to build the annotation frame.
        repeat: When falsy, ``update_gff`` is called before each of the three
            error stages (presumably to exclude already-used entries - confirm
            against ``update_gff``).
        distribution, no_mismatch_seed, no_mismatch_xseed: Forwarded to
            ``generate_sequence``.

    Returns:
        None. Results are written to files under ``out``.
    """
    # Local import: the module's import block does not provide shutil.
    import shutil

    rna_dict = _read_fasta_records(fa_file)

    fasta_seq = []
    gff_df = gff(gff_file, mir_type)

    # (percentage, impure-region label, ground-truth write mode, progress message)
    stages = (
        (std_seq_percent, 'None', 'write',
         '****Pure Sequence are generated.****'),
        (seed_error_percent, 'Seed_region', 'append',
         '****Sequence with error in seed region are generated.****'),
        (xseed_error_percent, 'Outside_Seed_region', 'append',
         '****Sequence with error in xseed region are generated.****'),
        (both_error_percent, 'Both_region', 'append',
         '****Sequence with error in both seed and xseed region are generated.****'),
    )
    for stage_no, (percent, region, mode, message) in enumerate(stages):
        # Before every stage except the first, update the gff dataframe so
        # that there is no repetition (only when repeats are not allowed).
        if stage_no > 0 and not repeat:
            gff_df = update_gff(gff_df, out, ground_truth_filename)
        no_mir_chr, n_seq_per_chr = sequence_calculation(
            total_no_seq, percent, depth, seed, gff_df)
        fasta_seq = generate_sequence(
            fasta_seq, gff_df, rna_dict, no_mir_chr, n_seq_per_chr, depth, region,
            out, ground_truth_filename, mode, repeat, distribution,
            no_mismatch_seed, no_mismatch_xseed, seed)
        print(message)

    write_fastq(fasta_seq, adaptor, ascii_base, out, out_file_name, out_file_type,
                seed, parallel_thread)

    # Merge similar synthetic sequences: rows sharing all identifying columns
    # are collapsed and their remaining columns summed.
    ground_truth_path = os.path.join(out, ground_truth_filename)
    ground_truth_df = pd.read_csv(ground_truth_path, sep='\t')
    ground_truth_df = ground_truth_df.groupby(
        ['RNA_ID', 'Impure_Region', 'Cigar_String', 'ref_Sequence',
         'synthetic_sequence', 'chr', 'chr_start', 'chr_end']).sum().reset_index()
    ground_truth_df = ground_truth_df.sort_values('Impure_Region')
    # NOTE: the file is read tab-separated but rewritten with to_csv's default
    # comma separator.
    ground_truth_df.to_csv(ground_truth_path, encoding='utf-8', index=False)

    # Delete the temporary folder, hidden entries included. Done in-process
    # rather than through a shell `rm -rf` string so that no glob can expand
    # relative to the current working directory and paths containing spaces or
    # shell metacharacters are safe. Best effort: failures are ignored.
    shutil.rmtree(os.path.join(out, 'temp'), ignore_errors=True)