Skip to content

Commit

Permalink
Mfannot - big improvement in handling this conversion (#483)
Browse files Browse the repository at this point in the history
* add group_ii_intron and orf in feature_levels.yaml

* fix MFannot #455 + add a new test to be sure feature from mfannot start at 1
* rewrite most of the script to use omniscient and better define features
* Fix case in checkSequential in OmniscientI where no L1 and l2 defined after L3.
* add sort for input hash In OmniscientI to preserve reproducibility
  • Loading branch information
Juke34 authored Sep 9, 2024
1 parent c2d8595 commit ead00b3
Show file tree
Hide file tree
Showing 8 changed files with 845 additions and 581 deletions.
610 changes: 288 additions & 322 deletions bin/agat_convert_mfannot2gff.pl

Large diffs are not rendered by default.

67 changes: 49 additions & 18 deletions lib/AGAT/OmniscientI.pm
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ sub slurp_gff3_file_JD {
# ============================> HASH CASE <============================
elsif(ref($file) eq 'HASH'){

foreach my $level (keys %{$file}){
foreach my $level ( sort { ncmp ($a, $b) } keys %{$file}){
# save header if any
if ($level eq 'other'){
foreach my $thing( keys %{$file->{'other'} } ){
Expand All @@ -301,10 +301,10 @@ sub slurp_gff3_file_JD {
next;
}
if ( ref($file->{$level}) eq 'HASH'){ #Header,level1,level2,#level3
foreach my $tag (keys %{$file->{$level}}){
foreach my $id (keys %{$file->{$level}{$tag}}){
foreach my $tag ( sort { ncmp ($a, $b) } keys %{$file->{$level}}){
foreach my $id ( sort { ncmp ($a, $b) } keys %{$file->{$level}{$tag}}){
if ( ref($file->{$level}{$tag}{$id}) eq 'ARRAY'){ #level2,#level3
foreach my $feature ( @{$file->{$level}{$tag}{$id} }){
foreach my $feature ( sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$file->{$level}{$tag}{$id} }){
($locusTAGvalue, $last_l1_f, $last_l2_f, $last_l3_f, $last_f, $lastL1_new) =
manage_one_feature($ontology, $feature, \%omniscient, \%mRNAGeneLink, \%duplicate, \%hashID, \%locusTAG, \%infoSequential, \%attachedL2Sequential, $locusTAGvalue, $last_l1_f, $last_l2_f, $last_l3_f, $last_f, $lastL1_new, $verbose, $log, $debug);
}
Expand All @@ -318,7 +318,7 @@ sub slurp_gff3_file_JD {
}
}
else{ #extra list of feature
foreach my $feature ( @{$file->{$level}} ) {
foreach my $feature ( sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$file->{$level}} ) {
($locusTAGvalue, $last_l1_f, $last_l2_f, $last_l3_f, $last_f, $lastL1_new) =
manage_one_feature($ontology, $feature, \%omniscient, \%mRNAGeneLink, \%duplicate, \%hashID, \%locusTAG, \%infoSequential, \%attachedL2Sequential, $locusTAGvalue, $last_l1_f, $last_l2_f, $last_l3_f, $last_f, $lastL1_new, $verbose, $log, $debug);
}
Expand Down Expand Up @@ -952,7 +952,7 @@ sub manage_one_feature{
if(!$last_l1_f and $last_l3_f){ #particular case : Two l3 that follow each other, but first one has locus_tag but not the second
dual_print ($log, "::::::::::Push-L3-sequential-2 $last_locusTAGvalue || ".lc($l2_id)." || level3 == ".$feature->gff_string."\n", $verbose) if ($debug);
push( @{$infoSequential->{'locus'}{lc($last_locusTAGvalue)}{lc($l2_id)}{'level3'}}, $feature );
$infoSequential->{'id'}{lc($l2_id)} = $l2_id;
$infoSequential->{'id'}{lc($l2_id)} = $l2_id;#}
return $last_locusTAGvalue, $last_l1_f, $last_l2_f, $feature, $feature, $lastL1_new;
}
else{
Expand Down Expand Up @@ -2842,21 +2842,47 @@ sub _deinterleave_sequential{
dual_print($log, "$resume_case cases of interleaved locus. ( Features of the locus data are defined earlier in the file.\n", $verbose) if($resume_case);
}

# Same locus but L3 and L2 have been put in two different buckets because L2 was met later in the file another parent have been created by AGAT
sub _fixL2andL3inDifferentBucketBecauseL2wasMetLaterInTheFile{
my ($infoSequential) = @_;
my $resume_case=undef;

foreach my $locus ( sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}} ){
foreach my $bucket (sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}{$locus} } ){
if( exists_keys($infoSequential,("locus", $locus, $bucket, 'level3') ) and ! exists_keys($infoSequential,("locus", $locus, $bucket, 'level2') ) ){

foreach my $bucket2 (sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}{$locus}} ){
if ($bucket ne $bucket2){
if (exists_keys($infoSequential,("locus", $locus, $bucket2, 'level2') ) ){
#Push to allow to add different bucket in the level2 bucket
foreach my $feature ( sort { ncmp ($a->start."|".$a->end.$a->_tag_value('ID'), $b->start."|".$b->end.$b->_tag_value('ID') ) } @{$infoSequential->{'locus'}{ $locus }{$bucket}{'level3'} } ){
push( @{$infoSequential->{'locus'}{lc($locus)}{lc($bucket2)}{'level3'}}, $feature );
}
delete $infoSequential->{'locus'}{ $locus }{$bucket}{'level3'};
if( !%{$infoSequential->{'locus'}{ $locus }{$bucket}} ) { delete $infoSequential->{'locus'}{ $locus }{$bucket}; }# remove this hash if empty
}
}
}
}
}
}
}

#
#
# BUCKET = LOCUS_ID
# All Level1 feature are for sure in omniscient, we have only the ID in sequential,
# other level feature are in sequential only if no parent has been found
sub _check_sequential{ # Goes through from L3 to l1
my ($debug, $log, $infoSequential, $omniscient, $hashID, $locusTAG_uniq, $mRNAGeneLink, $verbose) = @_;
my $resume_case_l2=0;
my $resume_case_l3=0;


_deinterleave_sequential($infoSequential, $locusTAG_uniq, $verbose, $log); # PART OF LOCUS LOST BEFORE TO MEET ITS L2 or L1 ... we catch them and re-link everything as it should be
_fixL2andL3inDifferentBucketBecauseL2wasMetLaterInTheFile( $infoSequential);

foreach my $locusNameHIS ( sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}} ){ #comon tag was l1 id when no real comon tag present

foreach my $bucket (sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}{$locusNameHIS} } ){ #bucket = level1 or Id L2
foreach my $bucket (sort { ncmp($a,$b) } keys %{$infoSequential->{'locus'}{$locusNameHIS} } ){ #bucket = level1 or Id L2

dual_print($log, "\nlocusNameHIS $locusNameHIS bucket $bucket\n", $verbose) if($debug);

Expand All @@ -2865,7 +2891,7 @@ sub _check_sequential{ # Goes through from L3 to l1
my $must_create_l2=undef;
my $feature_l2 = undef;

#Bucket is an uniq ID created during the reading process. So it can be used as uniq ID.
# NO LEVEL3 FEATURE FOR THIS BUCKET/LOCUS
if(! exists_keys($infoSequential,("locus", $locusNameHIS, $bucket, 'level3') ) ){

# Link the l2 to the L1 feature
Expand Down Expand Up @@ -2903,6 +2929,7 @@ sub _check_sequential{ # Goes through from L3 to l1
#We cannot guess the structure except if it is prokaryote or single exon in eucaryote.
}

# THIS LOCUS HAS LEVEL3 FEATURE
else{
foreach my $feature_L3 (@{$infoSequential->{'locus'}{$locusNameHIS}{$bucket}{'level3'}} ){

Expand All @@ -2928,7 +2955,7 @@ sub _check_sequential{ # Goes through from L3 to l1
}
}
$infoSequential->{'locus'}{$locusNameHIS}{$bucket}{'level2'} = $feature_l2;
}
}
#If locus_tag check from omniscient if feature has same locus tag
elsif ( $common_tag and ( exists_keys($locusTAG_uniq,('topfeature', lc($common_tag) ) ) ) and (! exists_keys($locusTAG_uniq,('topfeature', lc($common_tag),'level1') ) ) ) {
my $id_level2 = undef;
Expand All @@ -2954,13 +2981,17 @@ sub _check_sequential{ # Goes through from L3 to l1

#manage primary tag
my $primary_tag_l2='RNA';
foreach my $feature_L3 (@{$infoSequential->{'locus'}{$locusNameHIS}{$bucket}{'level3'}} ){
if ( $feature_l2->has_tag('agat_parent_type') ){
$primary_tag_l2 = $feature_l2->_tag_value('agat_parent_type');
} else {
foreach my $feature_L3 (@{$infoSequential->{'locus'}{$locusNameHIS}{$bucket}{'level3'}} ){

if ( lc($feature_L3->primary_tag) eq 'cds'){
$primary_tag_l2 ='mRNA';
last;
}
}
if ( lc($feature_L3->primary_tag) eq 'cds'){
$primary_tag_l2 ='mRNA';
last;
}
}
}
$feature_l2->primary_tag($primary_tag_l2);

#Manage ID
Expand Down
3 changes: 2 additions & 1 deletion lib/AGAT/OmniscientO.pm
Original file line number Diff line number Diff line change
Expand Up @@ -608,7 +608,8 @@ sub write_headers{
if (exists_keys( $omniscient, ('other', 'header') ) ){
my $gffXtra=$gffout->{"_filehandle"}; #to add extra lines to gff!!
foreach my $header_line ( @{$omniscient->{'other'}{'header'} } ) {
print $gffXtra $header_line;
$header_line =~ s/[\r\n]+$//g;
print $gffXtra $header_line."\n";
}
}
# to avoid to write again the header later in the file
Expand Down
4 changes: 3 additions & 1 deletion share/feature_levels.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ level1:
est_match: 1
expressed_sequence_match: 1
gene: 1
group_ii_intron: standalone
hat_tir_transposon: standalone
helitron: standalone
insulator: standalone
Expand All @@ -61,6 +62,7 @@ level1:
non_allelic_homologous_recombination_region: standalone
nucleotide_motif: 1
nucleotide_to_protein_match: 1
orf: 1
pif_harbinger_tir_transposon: standalone
pirna_gene: 1
polypeptide: 1
Expand Down Expand Up @@ -203,7 +205,7 @@ spread:
stop_codon: 1
three_prime_utr: 1
utr: 1
uORF: 1
uorf: 1
3utr: 1
5utr: 1

Expand Down
7 changes: 7 additions & 0 deletions t/scripts_output.t
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,13 @@ system(" $script --mfannot $input_folder/test.mfannot -o $outtmp 2>&1 1>/dev/nul
ok( system("diff $result $outtmp") == 0, "output $script");
unlink $outtmp;

$script = $script_prefix."bin/agat_convert_mfannot2gff.pl";
$result = "$output_folder/agat_convert_mfannot2gff_2.gff";
system(" $script --mfannot $input_folder/test.mfannot2 -o $outtmp 2>&1 1>/dev/null");
#run test
ok( system("diff $result $outtmp") == 0, "output $script");
unlink $outtmp;

# -------------------------- check agat_convert_minimap2_bam2gff.pl --------------------------
$script = $script_prefix."bin/agat_convert_minimap2_bam2gff.pl";
$result = "$output_folder/agat_convert_minimap2_bam2gff_1.gff";
Expand Down
Loading

0 comments on commit ead00b3

Please sign in to comment.