-
Notifications
You must be signed in to change notification settings - Fork 3
/
extend_align.sh
183 lines (164 loc) · 7.49 KB
/
extend_align.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#$ -V
#$ -cwd
#$ -S /bin/bash
#$ -N cPer_bat1k
#$ -o $JOB_NAME.o$JOB_ID
#$ -e $JOB_NAME.e$JOB_ID
#$ -q omni
#$ -pe sm 1
#$ -P quanah
####USAGE and OUTPUT
# qsub extend_align.sh <path to genome.gz> <path to output directory> <path to repeatmodeler queries>
#NOTE: do not use relative paths. Haven't figured out how to handle those yet.
#
# This script will take in queries generated by RepeatModeler and generate
#extended consensus sequences for visualization and evaluation.
#
## Input #1 = a genome file, zipped. The first field, SUBNAME, as divided by "." will be used
# as a designation
# Input #2 = the path to the output directory
# Input #3 = path to a file with repeatmodeler consensuses with headers modified to remove "#" and "/"
#
# Example - sh bin/cPer_bat1k_extend_align_svaca.sh assembly/cPer_bat1k.masked.fa.gz /lustre/scratch/daray/test2/cPer repeatmodeler/carollia_short.fa
#
## Output by directory (extend_align/SUBNAME...):
# blastfiles = blast output, database, and queries file
# extendlogs = log files from Hubley extension tool
# extensionwork = directory for each TE evaluated with output from tool
# extract_align = output from extract_align.py, contains catTEfiles to be used by extension tool
# genomefiles = genome assembly in .fa and .2bit format
# images_and_alignments = .png files and aligned files for visual valitation
# muscle = alignments of catTEfiles with the extension products
# rejects = alignments filtered for too few hits
# original_consensuses = individual files with the consensus suggested by RepeatModeler
# prealign = files to be aligned and deposited in muscle
######STUFF THAT MAY NEED CHANGING -- CHECK ALL#######
######CONDA OPERATING ENVIRONMENT######
#Set up conda environment
#Note: Before using this script, I set up this working enviroment within conda using:
# $ conda create --name extend_env
# $ conda activate extend_env
# $ conda install biopython
# $ conda install pandas
# $ conda install -c bioconda pyfaidx
# $ conda install --channel bioconda pybedtools
#activate conda environment, load modules
. ~/conda/etc/profile.d/conda.sh
conda activate extend_env
module load intel perl rmblast bedtools
#Locations of critical software
SOFTWARE=/lustre/work/daray/software
EXTENDPATH=$SOFTWARE/RepeatModeler-dev/util
GITPATH=/home/daray/gitrepositories/bioinfo_tools
#Variables for extract_align.py
SEQBUFFER=3000
SEQNUMBER=50
FLANK=100
######END STUFF THAT MAY NEED CHANGING######
##Get genome name info
GENOME=$1
echo "Genome file is "$GENOME
BASENAME=$(basename $GENOME .gz)
SUBNAME=$(basename $GENOME | awk -F'[.]' '{print $1}')
##Get consensus file info
CONSENSUSFILE=$3
echo "Queries file is "$CONSENSUSFILE
CONSENSUSSEQS=$(basename $CONSENSUSFILE)
echo $CONSENSUSSEQS
##Set paths
MAINPATH=$2
WORKDIR=$MAINPATH
echo $MAINPATH
mkdir -p $WORKDIR
#Subdirectory for this genome's work
THISGENOME=$WORKDIR/$SUBNAME
mkdir -p $THISGENOME/extensionwork
EXTENSIONWORK=$THISGENOME/extensionwork
mkdir -p $THISGENOME/extendlogs
EXTENDLOGS=$THISGENOME/extendlogs
mkdir -p $THISGENOME/genomefiles
GENOMEFILES=$THISGENOME/genomefiles
mkdir -p $THISGENOME/prealign
PREALIGN=$THISGENOME/prealign
mkdir -p $THISGENOME/muscle
MUSCLE=$THISGENOME/muscle
mkdir -p $THISGENOME/images_and_alignments
IMAGES=$THISGENOME/images_and_alignments
#Get genome fasta and unzip
echo "Checking genome files"
#if assembly does not exist in this directory, create it
[ ! -f $GENOMEFILES/$SUBNAME".masked.fa" ] && gunzip -c $GENOME > $GENOMEFILES/$SUBNAME".masked.fa"
#if .2bit version of the assembly does not exist, create it.
[ ! -f $GENOMEFILES/$SUBNAME".masked.2bit" ] && $SOFTWARE/faToTwoBit $GENOMEFILES/$SUBNAME".masked.fa" $GENOMEFILES/$SUBNAME".masked.2bit"
#Run blast on queries
echo "Checking blast files"
#if the blast files directory does not exist, create it
[ ! -d $THISGENOME/blastfiles ] && mkdir $THISGENOME/blastfiles
cd $THISGENOME/blastfiles
ln -s $GENOMEFILES/$SUBNAME".masked.fa"
cp $CONSENSUSFILE .
#if blast database doesn't exist, create it
[ ! -f *.nsq ] && makeblastdb -in $SUBNAME".masked.fa" -dbtype nucl
#if blast output doesn't exist, run blast
[ ! -f $SUBNAME"_blastn.out" ] && blastn -query $CONSENSUSSEQS -db $SUBNAME".masked.fa" -outfmt 6 -out $SUBNAME"_blastn.out"
#Run extract_align
echo "Running extract_align"
#check if extract_align directory exists
[ ! -d $THISGENOME/extract_align ] && mkdir $THISGENOME/extract_align
cd $THISGENOME/extract_align
pwd
ln -s $GENOMEFILES/$SUBNAME".masked.fa"
cp $CONSENSUSFILE .
#run extract_align.pl to pull as many as 50 of the best hits from the blast output out of the genome assembly. Those hits will go into catTEfiles directory.
python $GITPATH/extract_align.py -g $SUBNAME".masked.fa" -b $THISGENOME/blastfiles/$SUBNAME"_blastn.out" -l $CONSENSUSSEQS -lb $SEQBUFFER -rb $SEQBUFFER -n $SEQNUMBER -a n -e n -t n
#Run extend tool
echo "Running extension tool"
#for every file in the catTEfiles directory
for FILE in $THISGENOME/extract_align/catTEfiles/*.fa
# get the name of the TE being examined from the filename
do TEID=$(basename $FILE | awk -F'[.]' '{print $1}')
echo "TEID = "$TEID
#create a diretory for it if it doesn't already exist
[ ! -d $EXTENSIONWORK/$TEID ] && mkdir $EXTENSIONWORK/$TEID
cd $EXTENSIONWORK/$TEID
#run Robert Hubley's extension tool
$EXTENDPATH/davidExtendConsRAM.pl \
-genome $GENOMEFILES/$SUBNAME".masked.2bit" \
-family $FILE \
-div 5 \
-outdir . \
>$EXTENDLOGS/$TEID".extend.log"
#use Linup utility to get an alignment of the hits the extension tool used along with the extended consensus sequence
$EXTENDPATH/Linup -genome $GENOMEFILES/$SUBNAME".masked.2bit" -includeFlanking $FLANK -msa $EXTENSIONWORK/$TEID/out > $EXTENSIONWORK/$TEID/final.fa
#create an ungapped version of the alignment for alignment in the next step
sed '/^>/! s/-//g' $EXTENSIONWORK/$TEID/final.fa >$EXTENSIONWORK/$TEID/final_degap.fa
done
echo "Get visual representations"
#create a file to filter TEs with very few hits
mkdir -p $MUSCLE/rejects
cd $MUSCLE
#create a directory to store the original conensus sequences from RepeatModeler
mkdir -p $MUSCLE/original_consensuses
for FILE in $THISGENOME/extract_align/catTEfiles/*.fa
do TEID=$(basename $FILE | awk -F'[.]' '{print $1}')
#rename the rep file with TEID
sed "s/repam-newrep/$TEID/g" $EXTENSIONWORK/$TEID/rep >$EXTENSIONWORK/$TEID/rep_rename.fa
#prepare for alignment with all extracted hits
#concatenate the renamed rep with the extracts plus original consensus
cat $EXTENSIONWORK/$TEID/rep_rename.fa $FILE > $PREALIGN/$TEID".fa"
#run muscle on the concatenated file
$SOFTWARE/muscle/muscle -in $PREALIGN/$TEID".fa" -out $MUSCLE/$TEID".muscle.fa" -maxiters 2
#prepare for alignment with subset of hits
#pull out original consensus
awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' $FILE | grep -A 1 "^>CONSENSUS" > $MUSCLE/original_consensuses/$TEID".original.fa"
#concatenate subset of hits with original consensus and renamed rep
cat $EXTENSIONWORK/$TEID/rep_rename.fa $MUSCLE/original_consensuses/$TEID".original.fa" $EXTENSIONWORK/$TEID/final_degap.fa > $PREALIGN/$TEID".subset.fa"
#run muscle on concatenated file
$SOFTWARE/muscle/muscle -in $PREALIGN/$TEID".subset.fa" -out $IMAGES/$TEID".subset.muscle.fa"
#set up filtering for minimum number of hits
COUNT=$(grep ">" $MUSCLE/$TEID".muscle.fa" | wc -l)
echo "Hit count = "$COUNT
if test $COUNT -lt 7; then mv $MUSCLE/$TEID".muscle.fa" $MUSCLE/rejects; fi
if test $COUNT -gt 7; then cp $EXTENSIONWORK/$TEID/img.png $IMAGES/$TEID".png"; fi
if test $COUNT -lt 7; then cp $EXTENSIONWORK/$TEID/img.png $MUSCLE/rejects/$TEID".png"; fi
done