From 4b69b7431e0b93e4d2d79ebee580abcbd58568b4 Mon Sep 17 00:00:00 2001 From: Jean-Francois Pombert Date: Wed, 8 May 2024 08:57:36 -0400 Subject: [PATCH] + minimum contig size option --- Examples/commands.conf | 3 ++ Plots/nucleotide_biases.pl | 17 ++++++- README.md | 1 + Utils/list_maker.pl | 95 +++++++++++++++++++++++++++++--------- run_syny.pl | 53 +++++++++++---------- 5 files changed, 120 insertions(+), 49 deletions(-) diff --git a/Examples/commands.conf b/Examples/commands.conf index 1ea8f56..cb381eb 100644 --- a/Examples/commands.conf +++ b/Examples/commands.conf @@ -23,6 +23,9 @@ ### Allowable number of gaps between gene pairs,e.g. --gaps 0, 1, 5 [Default = 0] --gaps 0 +### Minimum contig size (in bp) [Default: 1]; i.e. investigates all contigs +--minsize 1 + ### Specify minimap2 max divergence preset (--asm 5, 10 or 20) [Default: off] # --asm 20 diff --git a/Plots/nucleotide_biases.pl b/Plots/nucleotide_biases.pl index 7350e6c..97bf2f6 100755 --- a/Plots/nucleotide_biases.pl +++ b/Plots/nucleotide_biases.pl @@ -1,8 +1,8 @@ #!/usr/bin/perl ## Pombert Lab, 2022 my $name = 'nucleotide_biases.pl'; -my $version = '0.7c'; -my $updated = '2024-05-01'; +my $version = '0.7d'; +my $updated = '2024-05-08'; use strict; use warnings; @@ -25,6 +25,7 @@ -outdir output_directory \\ -winsize 1000 \\ -step 500 \\ + -minsize 1 \\ -reference CCMP1205 \\ -gap 0 \\ -custom_preset chloropicon @@ -34,6 +35,7 @@ -o (--outdir) Output directory [Default: ntBiases] -w (--winsize) Sliding window size [Default: 10000] -s (--step) Sliding window step [Default: 5000] +-m (--minsize) Minimum contig size (in bp) [Default: 1] -n (--ncheck) Check for ambiguous/masked (Nn) nucleotides -t (--tsv) Output tab-delimited files (e.g. for excel plotting) @@ -63,6 +65,7 @@ my @fasta; my $outdir = 'ntBiases'; +my $minsize = 1; my $winsize = 10000; my $step = 5000; my $reference; @@ -88,6 +91,7 @@ GetOptions( 'f|fasta=s@{1,}' => \@fasta, 'o|outdir=s' => \$outdir, + 'minsize=i' => \$minsize, 'w|winsize=i' => \$winsize, 's|step=i' => \$step, 'n|ncheck' => \$ncheck, @@ -163,6 +167,7 @@ open FASTA, "<", $fasta or die "Can't open $fasta: $!\n"; + ## Database of sequences while (my $line = ){ chomp $line; if ($line =~ /^>(\S+)/){ @@ -173,6 +178,14 @@ } } + ## Delete sequence if smaller than minsize + foreach my $seq (keys %{$sequences{$fileprefix}}){ + my $seq_len = length($sequences{$fileprefix}{$seq}); + if ($seq_len < $minsize){ + delete $sequences{$fileprefix}; + } + } + my $fasta_dir = $singledir.'/'.$fileprefix; unless (-d $fasta_dir){ mkdir ($fasta_dir,0755) or die "Can't create $fasta_dir: $!\n"; diff --git a/README.md b/README.md index 54e94cb..7796ec8 100644 --- a/README.md +++ b/README.md @@ -244,6 +244,7 @@ Options for run_SYNY.pl are: -o (--outdir) Output directory [Default = SYNY] -e (--evalue) DIAMOND BLASTP evalue cutoff [Default = 1e-10] -g (--gaps) Allowable number of gaps between gene pairs [Default = 0] +--minsize Minimum contig size (in bp) [Default: 1] --asm Specify minimap2 max divergence preset (--asm 5, 10 or 20) [Default: off] --resume Resume minimap2 computations (skip completed alignments) --no_map Skip minimap2 pairwise genome alignments diff --git a/Utils/list_maker.pl b/Utils/list_maker.pl index 5385ffe..d83f322 100755 --- a/Utils/list_maker.pl +++ b/Utils/list_maker.pl @@ -2,8 +2,8 @@ # Pombert lab, 2020 my $name = 'list_maker.pl'; -my $version = '0.5.4b'; -my $updated = '2024-04-24'; +my $version = '0.5.5'; +my $updated = '2024-05-08'; use strict; use warnings; @@ -29,6 +29,7 @@ OPTIONS: -i (--input) Input file(s) (GZIP supported; File type determined by file extension) -o (--outdir) Output directory [Default: LIST_MAKER] +-m (--minsize) Keep only contigs larger or equal to specified size (in bp) [Default: 1] -v (--verbose) Add verbosity REGEX die "$usage\n" unless @ARGV; @@ -42,10 +43,12 @@ my @input_files; my $outdir = 'LIST_MAKER'; +my $minsize = 1; my $verbose; GetOptions( 'i|input=s@{1,}' => \@input_files, 'o|outdir=s' => \$outdir, + 'm|minsize=i' => \$minsize, 'v|verbose' => \$verbose ); @@ -89,7 +92,7 @@ } if ($verbose){ - print "Creating .list file for $file_name\n"; + print "\nCreating .list file for $file_name\n"; } my $annotation_list = "$list_dir/$file_prefix.list"; @@ -112,6 +115,13 @@ my $seq_flag; my $contig; + my $contig_size; + my $contig_sum = 0; + my $contig_kept_counter = 0; + my $contig_kept_sum = 0; + my $contig_discarded_counter = 0; + my $contig_discarded_sum = 0; + my $gene; my $CDS; my $translate; @@ -129,13 +139,31 @@ chomp $line; - if ($line =~ /^LOCUS\s+(\S+)/ ){ + if ($line =~ /^LOCUS\s+(\S+).*?(\d+) bp/ ){ + $contig = $1; + $contig_size = $2; + $contig_sum += $contig_size; + + if ($verbose){ + if ($contig_size >= $minsize){ + $contig_kept_counter++; + $contig_kept_sum += $contig_size; + print ' '.$contig."\t".$contig_size."\t". ">= $minsize"."\t"."kept\n"; + } + else { + $contig_discarded_counter++; + $contig_discarded_sum += $contig_size; + print ' '.$contig."\t".$contig_size."\t"."< $minsize"."\t"."discarded\n"; + } + } + ## if the contig is not named with a unique LOCUS ## add the file_prefix in front to prevent naming clashes if ($contig =~ /chromosome|contig/i){ $contig = $file_prefix.'_'.$contig; } + } if ($line =~ /^ORIGIN/){ @@ -202,27 +230,31 @@ unless (exists $isoform{$locus}){ ## Keeping only the first isoform - $isoform{$locus} = ''; - print OUT $locus."\t"; - print OUT $contig."\t"; - print OUT $start."\t"; - print OUT $end."\t"; - print OUT $strand."\t"; - print OUT $gene_num."\t"; + if ($contig_size >= $minsize){ + $isoform{$locus} = ''; + print OUT $locus."\t"; + print OUT $contig."\t"; + print OUT $start."\t"; + print OUT $end."\t"; + print OUT $strand."\t"; + print OUT $gene_num."\t"; + + if (!defined $features{$locus}{'product'}){ + $features{$locus}{'product'} = 'undefined product in accession'; + } + print OUT $features{$locus}{'product'}."\n"; + + print PROT ">$locus \[$features{$locus}{'product'}\]\n"; #\t$contig\t$start\t$end\t$strand\n"; + foreach my $line (unpack("(A60)*",$sequence)){ + print PROT "$line\n"; + } - if (!defined $features{$locus}{'product'}){ - $features{$locus}{'product'} = 'undefined product in accession'; - } - print OUT $features{$locus}{'product'}."\n"; - - print PROT ">$locus \[$features{$locus}{'product'}\]\n"; #\t$contig\t$start\t$end\t$strand\n"; - foreach my $line (unpack("(A60)*",$sequence)){ - print PROT "$line\n"; } undef $translate; undef $sequence; $gene_num ++; + } } @@ -289,13 +321,26 @@ } + if ($verbose){ + my $total_contigs = $contig_kept_counter + $contig_discarded_counter; + print "\n".'Total contigs: '."\t\t\t".$total_contigs."\t".$contig_sum." bp\n"; + print "Contigs kept (>= $minsize): "."\t".$contig_kept_counter."\t".$contig_kept_sum." bp\n"; + print "Contigs discarded (< $minsize): "."\t".$contig_discarded_counter."\t".$contig_discarded_sum." bp\n"; + } + ### Creating genome FASTA from GBFF for my $sequence (sort(keys %genome)){ - print GEN ">$sequence\n"; - my @data = unpack ("(A60)*", $genome{$sequence}); - while (my $tmp = shift@data){ - print GEN "$tmp\n"; + + my $contig_len = length($genome{$sequence}); + + if ($contig_len >= $minsize){ + print GEN ">$sequence\n"; + my @data = unpack ("(A60)*", $genome{$sequence}); + while (my $tmp = shift@data){ + print GEN "$tmp\n"; + } } + } } @@ -307,4 +352,8 @@ } exit; } +} + +if ($verbose){ + print "\n"; } \ No newline at end of file diff --git a/run_syny.pl b/run_syny.pl index 2ae4abe..6790a9b 100755 --- a/run_syny.pl +++ b/run_syny.pl @@ -2,8 +2,8 @@ # Pombert lab, 2022 my $name = 'run_syny.pl'; -my $version = '0.6.7a'; -my $updated = '2024-05-07'; +my $version = '0.6.7b'; +my $updated = '2024-05-08'; use strict; use warnings; @@ -47,6 +47,7 @@ -o (--outdir) Output directory [Default = SYNY] -e (--evalue) DIAMOND BLASTP evalue cutoff [Default = 1e-10] -g (--gaps) Allowable number of gaps between gene pairs [Default = 0] +--minsize Minimum contig size (in bp) [Default: 1] --asm Specify minimap2 max divergence preset (--asm 5, 10 or 20) [Default: off] --resume Resume minimap2 computations (skip completed alignments) --no_map Skip minimap2 pairwise genome alignments @@ -119,6 +120,7 @@ my $outdir = 'SYNY'; my $threads = 16; my $max_pthreads = $threads; +my $minsize = 1; my $nomap; my $noclus; my $resume; @@ -187,6 +189,7 @@ 'o|outdir=s' => \$outdir, 'e|evalue=s' => \$evalue, 'g|gaps=i{0,}' => \@gaps, + 'minsize=i' => \$minsize, 'no_map' => \$nomap, 'no_clus' => \$noclus, 'resume' => \$resume, @@ -446,32 +449,10 @@ print LOG "SYNY started on: ".$time."\n"; print LOG "COMMAND: $0 @commands\n\n"; -################################################################################################### -## Run list_maker.pl -################################################################################################### - -my @threads = initThreads($threads); - -print ERROR "\n### list_maker.pl ###\n"; -print "\n##### Extracting data from GenBank files\n"; - -my @annotations :shared = @annot_files; -my $annot_num = scalar(@annot_files); - -for my $thread (@threads){ - $thread = threads->create(\&list_maker); -} -for my $thread (@threads){ - $thread ->join(); -} - -logs(\*LOG, 'Parsing data - list_maker.pl'); - ################################################################################################### ## Shared options flags ################################################################################################### -# Option flags my $cluster_flag = ''; if ($clusters){ $cluster_flag = '--clusters'; @@ -517,6 +498,28 @@ $hm_vauto_flag = '--vauto'; } +################################################################################################### +## Run list_maker.pl +################################################################################################### + +my @threads = initThreads($threads); + +print ERROR "\n### list_maker.pl ###\n"; +print "\n##### Extracting data from GenBank files\n"; + +my @annotations :shared = @annot_files; +my $annot_num = scalar(@annot_files); + +for my $thread (@threads){ + $thread = threads->create(\&list_maker); +} +for my $thread (@threads){ + $thread ->join(); +} + +logs(\*LOG, 'Parsing data - list_maker.pl'); + + ################################################################################################### ## Get PAF files with minimap2 ################################################################################################### @@ -1146,6 +1149,7 @@ $plot_path/nucleotide_biases.pl \\ --outdir $circos_data_dir \\ --fasta $genome_dir/*.fasta \\ + --minsize $minsize \\ --winsize $winsize \\ --step $stepsize \\ --gap $gap \\ @@ -1323,6 +1327,7 @@ sub list_maker { $util_path/list_maker.pl \\ --input $annotation \\ --outdir $outdir \\ + --minsize $minsize \\ 2>> $log_err ") == 0 or checksig();