Mfannot - big improvement in handling this conversion (#483)

* add group_ii_intron and orf in feature_levels.yaml * fix MFannot #455 + add a new test to be sure feature from mfannot start at 1 * rewrite most of the script to use omniscient and better define features * Fix case in checkSequential in OmniscientI where no L1 and l2 defined after L3. * add sort for input hash In OmniscientI to preserve reproducibility
NBISweden · Sep 9, 2024 · ead00b3 · ead00b3
1 parent c2d8595
commit ead00b3
Show file tree

Hide file tree

Showing 8 changed files with 845 additions and 581 deletions.
diff --git a/bin/agat_convert_mfannot2gff.pl b/bin/agat_convert_mfannot2gff.pl
diff --git a/lib/AGAT/OmniscientI.pm b/lib/AGAT/OmniscientI.pm
@@ -285,7 +285,7 @@ sub slurp_gff3_file_JD {
 	# ============================> HASH CASE <============================
 	elsif(ref($file) eq 'HASH'){
 
-		foreach my $level (keys %{$file}){
+		foreach my $level ( sort { ncmp ($a, $b) } keys %{$file}){
 			# save header if any
 			if ($level eq 'other'){
 				foreach my $thing( keys %{$file->{'other'} } ){
@@ -301,10 +301,10 @@ sub slurp_gff3_file_JD {
 				next;
 			}
 			if ( ref($file->{$level}) eq 'HASH'){ #Header,level1,level2,#level3
-				foreach my $tag (keys %{$file->{$level}}){
-					foreach my $id (keys %{$file->{$level}{$tag}}){
+				foreach my $tag ( sort { ncmp ($a, $b) } keys %{$file->{$level}}){
+					foreach my $id ( sort { ncmp ($a, $b) } keys %{$file->{$level}{$tag}}){
 						if ( ref($file->{$level}{$tag}{$id}) eq 'ARRAY'){ #level2,#level3
-							foreach my $feature ( @{$file->{$level}{$tag}{$id} }){
+							foreach my $feature (  sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$file->{$level}{$tag}{$id} }){
 								($locusTAGvalue, $last_l1_f, $last_l2_f, $last_l3_f, $last_f, $lastL1_new) =
 										manage_one_feature($ontology, $feature, \%omniscient, \%mRNAGeneLink, \%duplicate, \%hashID, \%locusTAG, \%infoSequential, \%attachedL2Sequential, $locusTAGvalue, $last_l1_f, $last_l2_f, $last_l3_f, $last_f, $lastL1_new, $verbose, $log, $debug);
 							}
@@ -318,7 +318,7 @@ sub slurp_gff3_file_JD {
 				}
 			}
 			else{ #extra list of feature
-				foreach my $feature ( @{$file->{$level}} ) {
+				foreach my $feature (  sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$file->{$level}} ) {
 					($locusTAGvalue, $last_l1_f, $last_l2_f, $last_l3_f, $last_f, $lastL1_new) =
 							manage_one_feature($ontology, $feature, \%omniscient, \%mRNAGeneLink, \%duplicate, \%hashID, \%locusTAG, \%infoSequential, \%attachedL2Sequential, $locusTAGvalue, $last_l1_f, $last_l2_f, $last_l3_f, $last_f, $lastL1_new, $verbose, $log, $debug);
 	 				}
@@ -952,7 +952,7 @@ sub manage_one_feature{
 										if(!$last_l1_f and $last_l3_f){ #particular case : Two l3 that follow each other, but first one has locus_tag but not the second
 												dual_print ($log, "::::::::::Push-L3-sequential-2 $last_locusTAGvalue || ".lc($l2_id)." || level3 == ".$feature->gff_string."\n", $verbose) if ($debug);
 												push( @{$infoSequential->{'locus'}{lc($last_locusTAGvalue)}{lc($l2_id)}{'level3'}}, $feature );
-												$infoSequential->{'id'}{lc($l2_id)} = $l2_id;
+												$infoSequential->{'id'}{lc($l2_id)} = $l2_id;#}
 												return $last_locusTAGvalue, $last_l1_f, $last_l2_f, $feature, $feature, $lastL1_new;
 										}
 										else{
@@ -2842,21 +2842,47 @@ sub _deinterleave_sequential{
 	dual_print($log, "$resume_case cases of interleaved locus. ( Features of the locus data are defined earlier in the file.\n", $verbose) if($resume_case);
 }
 
+# Same locus but L3 and L2 have been put in two different buckets because L2 was met later in the file another parent have been created by AGAT
+sub _fixL2andL3inDifferentBucketBecauseL2wasMetLaterInTheFile{
+	my ($infoSequential) = @_;
+	my $resume_case=undef;
+
+	foreach my $locus ( sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}} ){
+		foreach my $bucket (sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}{$locus} } ){
+			if( exists_keys($infoSequential,("locus", $locus, $bucket, 'level3') ) and ! exists_keys($infoSequential,("locus", $locus, $bucket, 'level2') ) ){
+
+				foreach my $bucket2 (sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}{$locus}} ){
+					if ($bucket ne $bucket2){
+						if (exists_keys($infoSequential,("locus", $locus, $bucket2, 'level2') ) ){
+							#Push to allow to add different bucket in the level2 bucket
+							foreach my $feature ( sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$infoSequential->{'locus'}{ $locus }{$bucket}{'level3'} } ){
+								push( @{$infoSequential->{'locus'}{lc($locus)}{lc($bucket2)}{'level3'}}, $feature );
+							}
+							delete $infoSequential->{'locus'}{ $locus }{$bucket}{'level3'};
+							if( !%{$infoSequential->{'locus'}{ $locus }{$bucket}} ) {  delete $infoSequential->{'locus'}{ $locus }{$bucket};  }# remove this hash if empty
+						}
+					}
+				}
+			}
+		}
+	}
+}
+
 #
-#
+# BUCKET = LOCUS_ID
 # All Level1 feature are for sure in omniscient, we have only the ID in sequential,
 # other level feature are in sequential only if no parent has been found
 sub _check_sequential{ # Goes through from L3 to l1
  	my ($debug, $log, $infoSequential, $omniscient, $hashID, $locusTAG_uniq, $mRNAGeneLink, $verbose) = @_;
  	my $resume_case_l2=0;
  	my $resume_case_l3=0;
 
-
  	_deinterleave_sequential($infoSequential, $locusTAG_uniq, $verbose, $log); # PART OF LOCUS LOST BEFORE TO MEET ITS L2 or L1 ... we catch them and re-link everything as it should be
+	_fixL2andL3inDifferentBucketBecauseL2wasMetLaterInTheFile( $infoSequential);
 
  	foreach my $locusNameHIS ( sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}} ){ #comon tag was l1 id when no real comon tag present
-
- 		foreach my $bucket (sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}{$locusNameHIS} } ){ #bucket = level1 or Id L2
+				
+		foreach my $bucket (sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}{$locusNameHIS} } ){ #bucket = level1 or Id L2
 
 			dual_print($log, "\nlocusNameHIS $locusNameHIS bucket $bucket\n", $verbose) if($debug);
 
@@ -2865,7 +2891,7 @@ sub _check_sequential{ # Goes through from L3 to l1
  			my $must_create_l2=undef;
  			my $feature_l2 = undef;
 
-			#Bucket is an uniq ID created during the reading process. So it can be used as uniq ID.
+			# NO LEVEL3 FEATURE FOR THIS BUCKET/LOCUS
  			if(! exists_keys($infoSequential,("locus", $locusNameHIS, $bucket, 'level3') ) ){
 
 	 				# Link the l2 to the L1 feature
@@ -2903,6 +2929,7 @@ sub _check_sequential{ # Goes through from L3 to l1
 					#We cannot guess the structure except if it is prokaryote or single exon in eucaryote.
  			}
 
+			# THIS LOCUS HAS LEVEL3 FEATURE
 			else{
  				foreach my $feature_L3 (@{$infoSequential->{'locus'}{$locusNameHIS}{$bucket}{'level3'}} ){
 
@@ -2928,7 +2955,7 @@ sub _check_sequential{ # Goes through from L3 to l1
 				 						}
 			 						}
 		 							$infoSequential->{'locus'}{$locusNameHIS}{$bucket}{'level2'} = $feature_l2;
-								}
+								} 
 								#If locus_tag check from omniscient if feature has same locus tag
 								elsif ( $common_tag and ( exists_keys($locusTAG_uniq,('topfeature', lc($common_tag) ) ) ) and (! exists_keys($locusTAG_uniq,('topfeature', lc($common_tag),'level1') ) ) ) {
 										my $id_level2 = undef;
@@ -2954,13 +2981,17 @@ sub _check_sequential{ # Goes through from L3 to l1
 
 									#manage primary tag
 									my $primary_tag_l2='RNA';
-									foreach my $feature_L3 (@{$infoSequential->{'locus'}{$locusNameHIS}{$bucket}{'level3'}} ){
+									if ( $feature_l2->has_tag('agat_parent_type') ){
+										$primary_tag_l2 = $feature_l2->_tag_value('agat_parent_type');
+									} else {
+										foreach my $feature_L3 (@{$infoSequential->{'locus'}{$locusNameHIS}{$bucket}{'level3'}} ){
 
-			 							if ( lc($feature_L3->primary_tag) eq 'cds'){
-			 								$primary_tag_l2 ='mRNA';
-			 								last;
-			 							}
-			 						}
+											if ( lc($feature_L3->primary_tag) eq 'cds'){
+												$primary_tag_l2 ='mRNA';
+												last;
+											}
+										}
+									}
 			 						$feature_l2->primary_tag($primary_tag_l2);
 
 		 							#Manage ID

diff --git a/lib/AGAT/OmniscientO.pm b/lib/AGAT/OmniscientO.pm
@@ -608,7 +608,8 @@ sub write_headers{
     if (exists_keys( $omniscient, ('other', 'header') ) ){
       my $gffXtra=$gffout->{"_filehandle"}; #to add extra lines to gff!!
       foreach my $header_line ( @{$omniscient->{'other'}{'header'} } ) {
-        print $gffXtra $header_line;
+		$header_line =~ s/[\r\n]+$//g;
+        print $gffXtra $header_line."\n";
       }
     }
     # to avoid to write again the header later in the file

diff --git a/share/feature_levels.yaml b/share/feature_levels.yaml
@@ -41,6 +41,7 @@ level1:
   est_match: 1
   expressed_sequence_match: 1
   gene: 1
+  group_ii_intron: standalone
   hat_tir_transposon: standalone
   helitron: standalone
   insulator: standalone
@@ -61,6 +62,7 @@ level1:
   non_allelic_homologous_recombination_region: standalone
   nucleotide_motif: 1
   nucleotide_to_protein_match: 1
+  orf: 1
   pif_harbinger_tir_transposon: standalone
   pirna_gene: 1
   polypeptide: 1
@@ -203,7 +205,7 @@ spread:
    stop_codon: 1
    three_prime_utr: 1
    utr: 1
-   uORF: 1
+   uorf: 1
    3utr: 1
    5utr: 1
 

diff --git a/t/scripts_output.t b/t/scripts_output.t
@@ -89,6 +89,13 @@ system(" $script --mfannot $input_folder/test.mfannot -o $outtmp 2>&1 1>/dev/nul
 ok( system("diff $result $outtmp") == 0, "output $script");
 unlink $outtmp;
 
+$script = $script_prefix."bin/agat_convert_mfannot2gff.pl";
+$result = "$output_folder/agat_convert_mfannot2gff_2.gff";
+system(" $script --mfannot $input_folder/test.mfannot2 -o $outtmp 2>&1 1>/dev/null");
+#run test
+ok( system("diff $result $outtmp") == 0, "output $script");
+unlink $outtmp;
+
 # -------------------------- check agat_convert_minimap2_bam2gff.pl --------------------------
 $script = $script_prefix."bin/agat_convert_minimap2_bam2gff.pl";
 $result = "$output_folder/agat_convert_minimap2_bam2gff_1.gff";