decouple AUGUSTUS_CONFIG_PATH from scripts, allows for flexibility wi…

…th Augustus install
nextgenusfs · Jan 29, 2019 · 30c1166 · 30c1166
1 parent 29d23d0
commit 30c1166
Show file tree

Hide file tree

Showing 3 changed files with 68 additions and 32 deletions.
diff --git a/bin/augustus_parallel.py b/bin/augustus_parallel.py
@@ -148,11 +148,23 @@ def runAugustus(Input):
         file = os.path.join(tmpdir, file+'.augustus.gff3')
         with open(file) as input:
             output.write(input.read())
+
+if lib.checkannotations(os.path.join(tmpdir, 'augustus_all.gff3')):
+    lib.log.debug('Augustus finished, now joining results')
+if lib.which_path('join_aug_pred.pl'):
+    join_script = 'join_aug_pred.pl'
+else:
+    join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl')
 
-join_script = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_aug_pred.pl')
+cmd = '{:} < {:} > {:}'.format(join_script, os.path.join(tmpdir, 'augustus_all.gff3'), args.out)
+lib.log.debug(cmd)
+
 with open(args.out, 'w') as finalout:
-    with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'rU') as input:
-        subprocess.call([join_script],stdin = input, stdout = finalout)
+    with open(os.path.join(tmpdir, 'augustus_all.gff3'), 'rU') as infile:
+        subprocess.call([join_script], stdin = infile, stdout = finalout)
+
+#subprocess.call([cmd], shell=True)
+
 if not args.debug:
     shutil.rmtree(tmpdir)
 lib.log.info('Found {0:,}'.format(countGFFgenes(args.out))+' gene models')
diff --git a/bin/funannotate-predict.py b/bin/funannotate-predict.py
@@ -99,7 +99,7 @@ def which_path(file_name):
 FNULL = open(os.devnull, 'w')
 cmd_args = " ".join(sys.argv)+'\n'
 lib.log.debug(cmd_args)
-print("-------------------------------------------------------")
+sys.stderr.write("-------------------------------------------------------\n")
 lib.SystemInfo()
 
 #get version of funannotate
@@ -178,7 +178,18 @@ def which_path(file_name):
 
 if os.path.basename(os.path.normcase(os.path.abspath(AUGUSTUS))) == 'config':
     AUGUSTUS_BASE = os.path.dirname(os.path.abspath(AUGUSTUS))
-BAM2HINTS = os.path.join(AUGUSTUS_BASE, 'bin', 'bam2hints')
+if lib.which('bam2hints'):
+    BAM2HINTS = 'bam2hints'
+else:
+    BAM2HINTS = os.path.join(AUGUSTUS_BASE, 'bin', 'bam2hints')
+if lib.which_path('join_mult_hints.pl'):
+    JOINHINTS = 'join_mult_hints.pl'
+else:
+    JOINHINTS = os.path.join(AUGUSTUS_BASE, 'scripts', 'join_mult_hints.pl')
+if lib.which('gff2gbSmallDNA.pl'):
+    GFF2GB = 'gff2gbSmallDNA.pl'
+else:
+    GFF2GB = os.path.join(AUGUSTUS_BASE, 'scripts', 'gff2gbSmallDNA.pl')
 GeneMark2GFF = os.path.join(parentdir, 'util', 'genemark_gtf2gff3.pl')
 try:
     GENEMARKCMD = os.path.join(GENEMARK_PATH, 'gmes_petap.pl')
@@ -274,7 +285,7 @@ def which_path(file_name):
 system_os = lib.systemOS()
 if args.rna_bam:
     if augustuscheck[1] == 0:
-        lib.log.error("ERROR: %s is not installed properly for BRAKER (check bam2hints compilation)" % augustuscheck[0])
+        lib.log.error("ERROR: %s is not installed properly for BRAKER (check bam2hints/filterBam compilation)" % augustuscheck[0])
         sys.exit(1)
 
 if not augspeciescheck: #means training needs to be done
@@ -310,7 +321,7 @@ def which_path(file_name):
 #convert PASA GFF and/or GFF pass-through
 #convert PASA to have 'pasa_pred' in second column to make sure weights work with EVM
 PASA_GFF = os.path.join(args.out, 'predict_misc', 'pasa_predictions.gff3')
-PASA_weight = '10'
+PASA_weight = '6'
 if args.pasa_gff:
     if ':' in args.pasa_gff:
         pasacol = args.pasa_gff.split(':')
@@ -544,7 +555,10 @@ def which_path(file_name):
                     cmd = ['sort', '-s', '-k', '14,14', blat_sort1]
                     lib.runSubprocess2(cmd, '.', lib.log, blat_sort2)
                     #run blat2hints
-                    blat2hints = os.path.join(AUGUSTUS_BASE, 'scripts', 'blat2hints.pl')
+                    if lib.which('blat2hints.pl'):
+                        blat2hints = 'blat2hints.pl'
+                    else:
+                        blat2hints = os.path.join(AUGUSTUS_BASE, 'scripts', 'blat2hints.pl')
                     cmd = [blat2hints, b2h_input, b2h_output, '--minintronlen=20', '--trunkSS']
                     lib.runSubprocess(cmd, '.', lib.log)
                     total = lib.line_count(blat_sort2)
@@ -578,10 +592,10 @@ def which_path(file_name):
             lib.sortHints(bamhintstmp, bamhintssorted)
             #join hints
             bamjoinedhints = os.path.join(args.out, 'predict_misc', 'bam_hints.joined.tmp')
-            cmd = ['perl', os.path.join(AUGUSTUS_BASE, 'scripts', 'join_mult_hints.pl')]
+            cmd = [JOINHINTS]
             lib.runSubprocess5(cmd, '.', lib.log, bamhintssorted, bamjoinedhints)
             #filter intron hints
-            cmd = ['perl', os.path.join(parentdir, 'util', 'BRAKER', 'filterIntronsFindStrand.pl'), MaskGenome, bamjoinedhints, '--score']
+            cmd = [os.path.join(parentdir, 'util', 'BRAKER', 'filterIntronsFindStrand.pl'), MaskGenome, bamjoinedhints, '--score']
             lib.runSubprocess2(cmd, '.', lib.log, hintsBAM)
         else:
             lib.log.info("Existing RNA-seq BAM hints found: {:}".format(hintsBAM))
@@ -636,8 +650,7 @@ def which_path(file_name):
     #now sort hints file, and join multiple hints_all
     allhintstmp_sort = os.path.join(args.out, 'predict_misc', 'hints.all.sort.tmp')
     lib.sortHints(allhintstmp, allhintstmp_sort)
-    #lib.joinFilterHints(os.path.join(AUGUSTUS_BASE, 'scripts', 'join_mult_hints.pl'),os.path.join(parentdir, 'util', 'BRAKER', 'filterIntronsFindStrand.pl'), allhintstmp_sort, hints_all)
-    cmd = ['perl', os.path.join(AUGUSTUS_BASE, 'scripts', 'join_mult_hints.pl')]
+    cmd = [JOINHINTS]
     lib.runSubprocess5(cmd, '.', lib.log, allhintstmp_sort, hints_all)
 
     Augustus, GeneMark = (None,)*2
@@ -718,7 +731,11 @@ def which_path(file_name):
             #convert PASA GFF to GTF format
             lib.gff3_to_gtf(PASA_GFF, MaskGenome, trainingModels)
             #now get best models by cross-ref with intron BAM hints
-            cmd = [os.path.join(parentdir, 'util', 'BRAKER', 'filterGenemark.pl'), os.path.abspath(trainingModels), os.path.abspath(hints_all)]
+            if lib.which('filterGenemark.pl'):
+                FILTERGENE = 'filterGenemark.pl'
+            else:
+                FILTERGENE = os.path.join(parentdir, 'util', 'BRAKER', 'filterGenemark.pl')
+            cmd = [FILTERGENE, os.path.abspath(trainingModels), os.path.abspath(hints_all)]
             lib.runSubprocess4(cmd, os.path.join(args.out, 'predict_misc'), lib.log)
             totalTrain = lib.selectTrainingModels(PASA_GFF, MaskGenome, os.path.join(args.out, 'predict_misc', 'pasa.training.tmp.f.good.gtf'), finalModels)
             if totalTrain < args.min_training_models:
@@ -728,7 +745,6 @@ def which_path(file_name):
                 numTrainingSet = round(totalTrain * 0.10)
             else:
                 numTrainingSet = 100
-            GFF2GB = os.path.join(AUGUSTUS_BASE, 'scripts', 'gff2gbSmallDNA.pl')
             trainingset = os.path.join(args.out, 'predict_misc', 'augustus.pasa.gb')
             cmd = [GFF2GB, finalModels, MaskGenome, '600', trainingset]
             lib.runSubprocess(cmd, '.', lib.log)
@@ -1010,7 +1026,6 @@ def which_path(file_name):
             else:
                 numTrainingSet = 100
             lib.log.info("Training Augustus using BUSCO gene models")
-            GFF2GB = os.path.join(AUGUSTUS_BASE, 'scripts', 'gff2gbSmallDNA.pl')
             trainingset = os.path.join(args.out, 'predict_misc', 'busco.training.gb')
             cmd = [GFF2GB, trainingModels, MaskGenome, '600', trainingset]
             lib.runSubprocess(cmd, '.', lib.log)
@@ -1163,17 +1178,17 @@ def which_path(file_name):
         if not 'augustus' in EVMWeights:
             EVMWeights['augustus'] = '1'
         if os.path.isfile(hints_all):
-            output.write("OTHER_PREDICTION\tHiQ\t5\n")
+            output.write("OTHER_PREDICTION\tHiQ\t2\n")
             if not 'hiq' in EVMWeights:
-                EVMWeights['hiq'] = '5'
+                EVMWeights['hiq'] = '2'
         if args.pasa_gff:
             output.write("OTHER_PREDICTION\tpasa_pred\t%s\n" % PASA_weight)
             if not 'pasa' in EVMWeights:
                 EVMWeights['pasa'] = PASA_weight
         if Quarry: #set coding quarry to same as PASA weight
-            output.write("OTHER_PREDICTION\tCodingQuarry\t%s\n" % PASA_weight)
+            output.write("OTHER_PREDICTION\tCodingQuarry\t%s\n" % '2')
             if not 'CodingQuarry' in EVMWeights:
-                EVMWeights['CodingQuarry'] = PASA_weight
+                EVMWeights['CodingQuarry'] = '2'
         if Exonerate:
             output.write("PROTEIN\texonerate\t1\n")
         if Transcripts:
@@ -1195,16 +1210,16 @@ def which_path(file_name):
     if k in EVMWeights:
         eviweight = EVMWeights.get(k)
     if k == 'hiq':
-        print('{:} models ({:}):\t\t{:^>,}'.format('HiQ', eviweight, v))
+        sys.stderr.write('{:} models ({:}):\t\t{:^>,}\n'.format('HiQ', eviweight, v))
     elif k == 'pasa' and v > 0:
-        print('{:} models ({:}):\t{:^>,}'.format('PASA', eviweight, v))
+        sys.stderr.write('{:} models ({:}):\t{:^>,}\n'.format('PASA', eviweight, v))
     elif k == 'CodingQuarry':
-        print('{:} models ({:}):\t{:^>,}'.format('CodeQuarry', eviweight, v))
+        sys.stderr.write('{:} models ({:}):\t{:^>,}\n'.format('CodeQuarry', eviweight, v))
     elif k == 'total':
-        print('{:} models:\t\t{:^>,}'.format(k.capitalize(), v))
+        sys.stderr.write('{:} models:\t\t{:^>,}\n'.format(k.capitalize(), v))
     else:
-        print('{:} models ({:}):\t{:^>,}'.format(k.capitalize(), eviweight, v))
-print('-------------------------------------------------------')
+        sys.stderr.write('{:} models ({:}):\t{:^>,}\n'.format(k.capitalize(), eviweight, v))
+sys.stderr.write('-------------------------------------------------------\n')
 
 if args.keep_evm and os.path.isfile(EVM_out):
     lib.log.info("Using existing EVM predictions")
@@ -1347,7 +1362,7 @@ def which_path(file_name):
 #check if there are error that need to be fixed
 errors = lib.ncbiCheckErrors(final_error, final_validation, prefix, final_fixes)
 if errors > 0:
-    print('-------------------------------------------------------')
+    sys.stderr.write('-------------------------------------------------------\n')
     lib.log.info("Manually edit the tbl file %s, then run:\n\nfunannotate fix -i %s -t %s\n" % (final_tbl, final_gbk, final_tbl))
     lib.log.info("After the problematic gene models are fixed, you can proceed with functional annotation.")
 

diff --git a/lib/library.py b/lib/library.py
@@ -5969,9 +5969,18 @@ def getGenesGTF(input):
     return genes
 
 def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpus, num_training, optimize):
-    RANDOMSPLIT = os.path.join(AUGUSTUS_BASE, 'scripts', 'randomSplit.pl')
-    OPTIMIZE = os.path.join(AUGUSTUS_BASE, 'scripts', 'optimize_augustus.pl')
-    NEW_SPECIES = os.path.join(AUGUSTUS_BASE, 'scripts', 'new_species.pl')
+    if which('randomSplit.pl'):
+        RANDOMSPLIT = 'randomSplit.pl'
+    else:
+        RANDOMSPLIT = os.path.join(AUGUSTUS_BASE, 'scripts', 'randomSplit.pl')
+    if which('optimize_augustus.pl'):
+        OPTIMIZE = 'optimize_augustus.pl'
+    else:
+        OPTIMIZE = os.path.join(AUGUSTUS_BASE, 'scripts', 'optimize_augustus.pl')
+    if which('new_species.pl'):
+        NEW_SPECIES = 'new_species.pl'
+    else:
+        NEW_SPECIES = os.path.join(AUGUSTUS_BASE, 'scripts', 'new_species.pl')
     aug_cpus = '--cpus='+str(cpus)
     species = '--species='+train_species
     aug_log = os.path.join(outdir, 'logfiles', 'augustus_training.log')
@@ -5981,18 +5990,18 @@ def trainAugustus(AUGUSTUS_BASE, train_species, trainingset, genome, outdir, cpu
     trainingdir = os.path.join(outdir, 'predict_misc', 'tmp_opt_'+train_species)
     with open(aug_log, 'w') as logfile:
         if not CheckAugustusSpecies(train_species):
-            subprocess.call(['perl', NEW_SPECIES, species], stdout = logfile, stderr = logfile)
+            subprocess.call([NEW_SPECIES, species], stdout = logfile, stderr = logfile)
         #run etraining again to only use best models from EVM for training
         subprocess.call(['etraining', species, TrainSet], cwd = os.path.join(outdir, 'predict_misc'), stderr = logfile, stdout = logfile)
-        subprocess.call(['perl', RANDOMSPLIT, TrainSet, str(num_training)], cwd = os.path.join(outdir, 'predict_misc')) #split off num_training models for testing purposes
+        subprocess.call([RANDOMSPLIT, TrainSet, str(num_training)], cwd = os.path.join(outdir, 'predict_misc')) #split off num_training models for testing purposes
         if os.path.isfile(os.path.join(outdir, 'predict_misc', TrainSet+'.train')):
             with open(os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt'), 'w') as initialtraining:
                 subprocess.call(['augustus', species, TrainSet+'.test'], stdout=initialtraining, cwd = os.path.join(outdir, 'predict_misc'))
             train_results = getTrainResults(os.path.join(outdir, 'predict_misc', 'augustus.initial.training.txt'))
             log.info('Augustus initial training results (specificity/sensitivity):\nnucleotides ({:.1%}/{:.1%}); exons ({:.1%}/{:.1%}); genes ({:.1%}/{:.1%}).'.format(train_results[0],train_results[1],train_results[2],train_results[3],train_results[4],train_results[5]))
             if optimize:
                 #now run optimization
-                subprocess.call(['perl', OPTIMIZE, species, aug_cpus, onlytrain, testtrain], cwd = os.path.join(outdir, 'predict_misc'), stderr = logfile, stdout = logfile)
+                subprocess.call([OPTIMIZE, species, aug_cpus, onlytrain, testtrain], cwd = os.path.join(outdir, 'predict_misc'), stderr = logfile, stdout = logfile)
                 #run etraining again
                 subprocess.call(['etraining', species, TrainSet], cwd = os.path.join(outdir, 'predict_misc'), stderr = logfile, stdout = logfile)
                 with open(os.path.join(outdir, 'predict_misc', 'augustus.final.training.txt'), 'w') as finaltraining: