extend_align.sh

#$ -V
#$ -cwd
#$ -S /bin/bash
#$ -N cPer_bat1k
#$ -o $JOB_NAME.o$JOB_ID
#$ -e $JOB_NAME.e$JOB_ID
#$ -q omni
#$ -pe sm 1
#$ -P quanah


####USAGE and OUTPUT
# qsub extend_align.sh <path to genome.gz> <path to output directory> <path to repeatmodeler queries>
#NOTE: do not use relative paths. Haven't figured out how to handle those yet.
#
# This script will take in queries generated by RepeatModeler and generate
#extended consensus sequences for visualization and evaluation.
#
## Input #1 = a genome file, zipped. The first field, SUBNAME, as divided by "." will be used 
# as a designation
# Input #2 = the path to the output directory
# Input #3 = path to a file with repeatmodeler consensuses with headers modified to remove "#" and "/"
# 
# Example - sh bin/cPer_bat1k_extend_align_svaca.sh assembly/cPer_bat1k.masked.fa.gz /lustre/scratch/daray/test2/cPer repeatmodeler/carollia_short.fa
#
## Output by directory (extend_align/SUBNAME...):
# blastfiles = blast output, database, and queries file
# extendlogs = log files from Hubley extension tool
# extensionwork = directory for each TE evaluated with output from tool
# extract_align = output from extract_align.py, contains catTEfiles to be used by extension tool
# genomefiles = genome assembly in .fa and .2bit format
# images_and_alignments = .png files and aligned files for visual valitation 
# muscle = alignments of catTEfiles with the extension products
#	rejects = alignments filtered for too few hits
#	original_consensuses = individual files with the consensus suggested by RepeatModeler
# prealign = files to be aligned and deposited in muscle

######STUFF THAT MAY NEED CHANGING -- CHECK ALL#######

######CONDA OPERATING ENVIRONMENT######
#Set up conda environment 
#Note: Before using this script, I set up this working enviroment within conda using:
# $ conda create --name extend_env
# $ conda activate extend_env
# $ conda install biopython
# $ conda install pandas
# $ conda install -c bioconda pyfaidx
# $ conda install --channel bioconda pybedtools

#activate conda environment, load modules
. ~/conda/etc/profile.d/conda.sh
conda activate extend_env
module load intel perl rmblast bedtools

#Locations of critical software
SOFTWARE=/lustre/work/daray/software
EXTENDPATH=$SOFTWARE/RepeatModeler-dev/util
GITPATH=/home/daray/gitrepositories/bioinfo_tools

#Variables for extract_align.py
SEQBUFFER=3000 
SEQNUMBER=50 
FLANK=100
######END STUFF THAT MAY NEED CHANGING######

##Get genome name info
GENOME=$1
echo "Genome file is "$GENOME
BASENAME=$(basename $GENOME .gz)
SUBNAME=$(basename $GENOME | awk -F'[.]' '{print $1}')

##Get consensus file info
CONSENSUSFILE=$3
echo "Queries file is "$CONSENSUSFILE
CONSENSUSSEQS=$(basename $CONSENSUSFILE)
echo $CONSENSUSSEQS

##Set paths
MAINPATH=$2
WORKDIR=$MAINPATH
echo $MAINPATH
mkdir -p $WORKDIR

#Subdirectory for this genome's work
THISGENOME=$WORKDIR/$SUBNAME
mkdir -p $THISGENOME/extensionwork
EXTENSIONWORK=$THISGENOME/extensionwork
mkdir -p $THISGENOME/extendlogs
EXTENDLOGS=$THISGENOME/extendlogs
mkdir -p $THISGENOME/genomefiles
GENOMEFILES=$THISGENOME/genomefiles
mkdir -p $THISGENOME/prealign
PREALIGN=$THISGENOME/prealign
mkdir -p $THISGENOME/muscle
MUSCLE=$THISGENOME/muscle
mkdir -p $THISGENOME/images_and_alignments
IMAGES=$THISGENOME/images_and_alignments

#Get genome fasta and unzip
echo "Checking genome files"
#if assembly does not exist in this directory, create it
[ ! -f $GENOMEFILES/$SUBNAME".masked.fa" ] && gunzip -c $GENOME > $GENOMEFILES/$SUBNAME".masked.fa"
#if .2bit version of the assembly does not exist, create it.
[ ! -f $GENOMEFILES/$SUBNAME".masked.2bit" ] && $SOFTWARE/faToTwoBit $GENOMEFILES/$SUBNAME".masked.fa" $GENOMEFILES/$SUBNAME".masked.2bit"

#Run blast on queries
echo "Checking blast files"
#if the blast files directory does not exist, create it
[ ! -d $THISGENOME/blastfiles ] && mkdir $THISGENOME/blastfiles
cd $THISGENOME/blastfiles
ln -s $GENOMEFILES/$SUBNAME".masked.fa"
cp $CONSENSUSFILE .
#if blast database doesn't exist, create it
[ ! -f *.nsq ] && makeblastdb -in $SUBNAME".masked.fa" -dbtype nucl
#if blast output doesn't exist, run blast
[ ! -f $SUBNAME"_blastn.out" ] && blastn -query $CONSENSUSSEQS -db $SUBNAME".masked.fa" -outfmt 6 -out $SUBNAME"_blastn.out"

#Run extract_align
echo "Running extract_align"
#check if extract_align directory exists
[ ! -d $THISGENOME/extract_align ] && mkdir $THISGENOME/extract_align
cd $THISGENOME/extract_align
pwd
ln -s $GENOMEFILES/$SUBNAME".masked.fa"
cp $CONSENSUSFILE .
#run extract_align.pl to pull as many as 50 of the best hits from the blast output out of the genome assembly. Those hits will go into catTEfiles directory. 
python  $GITPATH/extract_align.py -g $SUBNAME".masked.fa" -b $THISGENOME/blastfiles/$SUBNAME"_blastn.out" -l $CONSENSUSSEQS -lb $SEQBUFFER -rb $SEQBUFFER -n $SEQNUMBER -a n -e n -t n

#Run extend tool
echo "Running extension tool"
#for every file in the catTEfiles directory
for FILE in $THISGENOME/extract_align/catTEfiles/*.fa
	# get the name of the TE being examined from the filename
	do TEID=$(basename $FILE | awk -F'[.]' '{print $1}')
	echo "TEID = "$TEID
	#create a diretory for it if it doesn't already exist
	[ ! -d $EXTENSIONWORK/$TEID ] && mkdir $EXTENSIONWORK/$TEID
	cd $EXTENSIONWORK/$TEID
	#run Robert Hubley's extension tool
	$EXTENDPATH/davidExtendConsRAM.pl \
		-genome $GENOMEFILES/$SUBNAME".masked.2bit" \
		-family $FILE \
		-div 5 \
		-outdir . \
		>$EXTENDLOGS/$TEID".extend.log"
	#use Linup utility to get an alignment of the hits the extension tool used along with the extended consensus sequence
	$EXTENDPATH/Linup -genome $GENOMEFILES/$SUBNAME".masked.2bit" -includeFlanking $FLANK -msa $EXTENSIONWORK/$TEID/out > $EXTENSIONWORK/$TEID/final.fa
	#create an ungapped version of the alignment for alignment in the next step
	sed '/^>/! s/-//g' $EXTENSIONWORK/$TEID/final.fa >$EXTENSIONWORK/$TEID/final_degap.fa
done

echo "Get visual representations"
#create a file to filter TEs with very few hits
mkdir -p $MUSCLE/rejects
cd $MUSCLE
#create a directory to store the original conensus sequences from RepeatModeler
mkdir -p $MUSCLE/original_consensuses
for FILE in $THISGENOME/extract_align/catTEfiles/*.fa
	do TEID=$(basename $FILE | awk -F'[.]' '{print $1}')
	#rename the rep file with TEID
	sed "s/repam-newrep/$TEID/g" $EXTENSIONWORK/$TEID/rep >$EXTENSIONWORK/$TEID/rep_rename.fa
	#prepare for alignment with all extracted hits
	#concatenate the renamed rep with the extracts plus original consensus
	cat $EXTENSIONWORK/$TEID/rep_rename.fa $FILE > $PREALIGN/$TEID".fa"
	#run muscle on the concatenated file
	$SOFTWARE/muscle/muscle -in $PREALIGN/$TEID".fa" -out $MUSCLE/$TEID".muscle.fa" -maxiters 2
	#prepare for alignment with subset of hits
	#pull out original consensus
	awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);}  END {printf("\n");}' $FILE | grep -A 1 "^>CONSENSUS" > $MUSCLE/original_consensuses/$TEID".original.fa"
	#concatenate subset of hits with original consensus and renamed rep
	cat $EXTENSIONWORK/$TEID/rep_rename.fa $MUSCLE/original_consensuses/$TEID".original.fa" $EXTENSIONWORK/$TEID/final_degap.fa > $PREALIGN/$TEID".subset.fa"
	#run muscle on concatenated file
	$SOFTWARE/muscle/muscle -in $PREALIGN/$TEID".subset.fa" -out $IMAGES/$TEID".subset.muscle.fa"
	#set up filtering for minimum number of hits
	COUNT=$(grep ">" $MUSCLE/$TEID".muscle.fa" | wc -l)
	echo "Hit count = "$COUNT
	if test $COUNT -lt 7; then mv $MUSCLE/$TEID".muscle.fa" $MUSCLE/rejects; fi 
	if test $COUNT -gt 7; then cp $EXTENSIONWORK/$TEID/img.png $IMAGES/$TEID".png"; fi 
	if test $COUNT -lt 7; then cp $EXTENSIONWORK/$TEID/img.png $MUSCLE/rejects/$TEID".png"; fi
done