-
Notifications
You must be signed in to change notification settings - Fork 1
/
BB_BuildTrans1.sh
executable file
·217 lines (199 loc) · 7.88 KB
/
BB_BuildTrans1.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
#!/bin/sh
##----INTRO-----------##
# Name=Nov14.BuildTrans
# Date=Nov 14,2015
# Update=Nov 16,2015
########################
# Purpose
#This file is for build new transcriptome by RNA-seq.
#1.blat+psl filter(python)
###process with trinity_out_psl_file, fileter psl and get intron information.
#2.STAR+SAM/BAM+cufflinks
###RNAseq normal mapping and junction mapping, and assemble into transcript.
#######--Arguments--#######
help_info(){
echo "usage:"
echo "sh Nov14.BuildTrans.sh <option>* [-1 sample_1.fq] [-2 sample_2.fq] [-a sample.fa] [-l read_len] [-s species] [-n sample_name]"
echo ""
echo "Arguments:"
echo "-1 The first fastq file for paried-end, or the fastq file for single-end"
echo "-2 The second fastq file if sample is paried-end"
echo "-a Trinity result fatsa file"
echo "-s Species used for getting the right annotation files"
echo "-n Sample name"
echo "-o Overhang of cufflink arguement, usually read length-1, default: 100"
echo "-C CPU number used for the pipeline"
echo "-m minfrags of cufflink arguement, default: 10"
echo ""
echo "This file is for process the result after Trinity."
echo ""
}
if [ $# -lt 1 ];then
help_info
exit 1
fi
samp_2_fq="NO"
CPU=1
min_frag=10
while getopts "1:2:a:s:n:l:C:m:" Arg
do
case $Arg in
1) samp_1_fq=$OPTARG;;
2) samp_2_fq=$OPTARG;;
a) samp_fa=$OPTARG;;
s) species=$OPTARG;;
n) samp_name=$OPTARG;;
l) over_hang=$OPTARG;;
C) CPU=$OPTARG;;
m) min_frag=$OPTARG;;
?) echo "Wrong parameter!!!"
exit 1;;
esac
done
# get the right annotations based on species
case $species in
mouse) anno="/data/tongji2/annotation/Mus_musculus.GRCm38.82_seleno_filtered_final.gtf"
geno_fa="/data/tongji2/genome/mm10.fa"
STAR_inde="/data/tongji2/index/mm10_star_gtf/"
bowt_inde="/data/tongji2/index/mm10_bowtie2/"
chro_size="/data/tongji2/chrom.size/mm10.chrom.size";;
rat) anno="/data/tongji2/annotation/Rattus_norvegicus.Rnor_6.0.82_seleno_filtered_final.gtf"
geno_fa="/data/tongji2/genome/rn6.fa"
STAR_inde="/data/tongji2/index/rn6_star_gtf/"
bowt_inde="/data/tongji2/index/rn6_bowtie2/"
chro_size="/data/tongji2/chrom.size/rn6.chrom.size";;
human) anno="/data/tongji2/annotation/Homo_sapiens.GRCh38.82_seleno_filtered_final.gtf"
geno_fa="/data/tongji2/genome/hg38.fa"
STAR_inde="/data/tongji2/index/hg38_star_gtf/"
bowt_inde="/data/tongji2/index/hg38_bowtie2/"
chro_size="/data/tongji2/chrom.size/hg38.chrom.size";;
cow) anno="/data/tongji2/annotation/Bos_taurus.UMD3.1.82_seleno_filtered_final.gtf"
geno_fa="/data/tongji2/genome/bosTau8.fa"
STAR_inde="/data/tongji2/index/bosTau8_star_gtf/"
bowt_inde="/data/tongji2/index/bosTau8_bowtie2/"
chro_size="/data/tongji2/chrom.size/bosTau8.chrom.size";;
rhesus) anno="/data/tongji2/annotation/Macaca_mulatta.MMUL_1.82_seleno_filtered_final.gtf"
geno_fa="/data/tongji2/genome/rheMac3.fa"
STAR_inde="/data/tongji2/index/rheMac3_star_gtf/"
bowt_inde="/data/tongji2/index/rheMac3_bowtie2/"
chro_size="/data/tongji2/chrom.size/rheMac3.chrom.size";;
chicken) anno="/data/tongji2/annotation/Gallus_gallus.Galgal4.82_seleno_filtered_final.gtf"
geno_fa="/data/tongji2/genome/galGal4.fa"
STAR_inde="/data/tongji2/index/galGal4_star_gtf/"
bowt_inde="/data/tongji2/index/galGal4_bowtie2/"
chro_size="/data/tongji2/chrom.size/galGal4.chrom.size";;
platypus) anno="/data/tongji2/annotation/Ornithorhynchus_anatinus.OANA5.82_seleno_filtered_final.gtf"
geno_fa="/data/tongji2/genome/ornAna1.fa"
STAR_inde="/data/tongji2/index/ornAna1_star_gtf/"
bowt_inde="/data/tongji2/index/ornAna1_bowtie2/"
chro_size="/data/tongji2/chrom.size/ornAna1.chrom.size";;
chimpanzee) anno="/data/tongji2/annotation/Pan_troglodytes.CHIMP2.1.4.82_seleno_filtered_final.gtf"
geno_fa="/data/tongji2/genome/panTro4.fa"
STAR_inde="/data/tongji2/index/panTro4_star_gtf/"
bowt_inde="/data/tongji2/index/panTro4_bowtie2/"
chro_size="/data/tongji2/chrom.size/panTro4.chrom.size";;
gorilla) anno="/data/tongji2/annotation/Gorilla_gorilla.gorGor3.1.82_seleno_filtered_final.gtf"
geno_fa="/data/tongji2/genome/gorGor3.fa"
STAR_inde="/data/tongji2/index/gorGor3_star_gtf/"
bowt_inde="/data/tongji2/index/gorGor3_bowtie2/"
chro_size="/data/tongji2/chrom.size/gorGor3.chrom.size";;
opossum) anno="/data/tongji2/annotation/Monodelphis_domestica.BROADO5.82_seleno_filtered_final.gtf"
geno_fa="/data/tongji2/genome/monDom5.fa"
STAR_inde="/data/tongji2/index/monDom5_star_gtf/"
bowt_inde="/data/tongji2/index/monDom5_bowtie2/"
chro_size="/data/tongji2/chrom.size/monDom5.chrom.size";;
?) echo "No Species!!!"
exit 1;;
esac
pyth_path="/home/tongji2/piRNA/Code/Tianxiong/python_code/"
shel_path="/home/tongji2/piRNA/Code/Tianxiong/shell_code/"
###########################
echo "Start!"
###--mkdir--###
mkdir $samp_name
cd $samp_name
###--Process--###
#################
#1.blat+psl filter(python)
###process with trinity_out_psl_file, fileter psl and get intron information.
#-blat-#
#from Trinity_output fasta file to psl file
# blat -q=rna -out=psl $geno_fa $samp_fa ./${samp_name}.psl
# #-Python-#
# #filter psl file with %95 fraction mapped and %1 mismatch
# python ${pyth_path}tran_xlstxt_to_txt.py ./${samp_name}.psl ./temp.psl
# mv ./temp.psl ./${samp_name}.psl
# BB_PslFilter.py ./${samp_name}.psl ./temp.psl 0.95 0.01
# mv ./temp.psl ./${samp_name}.psl
# #-Python-#
# #get intron from psl file
# BB_GetIntronFromPsl.py ./${samp_name}.psl ./${samp_name}_trinity_intron.tab
# #merge introns we got and introns annotated
# cat ./${samp_name}_trinity_intron.tab ${STAR_inde}sjdbList.out.tab | sort -u > ${samp_name}_merge_intron.tab
#################
#2.STAR+SAM/BAM+cufflinks
###RNAseq normal mapping and junction mapping, and assemble into transcript.
#-STAR-#
#splicing junction based STAR for all reads from sample.fq
#use 16 CPU, 2 mismatch, 100 max multi-map, , write all sam attributes, write unmapped reads
# mkdir ./temp_index
mkdir ./star_sj
# # build index
# STAR --runMode genomeGenerate \
# --runThreadN $CPU \
# --sjdbFileChrStartEnd ./${samp_name}_merge_intron.tab \
# --sjdbOverhang $over_hang \
# --genomeFastaFiles $geno_fa \
# --genomeDir ./temp_index/
# run STAR
if [ $samp_2_fq = 'NO' ]; then
#sangle-end
STAR --genomeDir /data/tongji2/Pipeline_get_piRNA_transcript/mouse/mouse_testes_star_index/ \
--runThreadN $CPU \
--readFilesIn $samp_1_fq \
--outFileNamePrefix ./star_sj/${samp_name} \
--outFilterMismatchNmax 2 \
--outFilterMultimapNmax 100 \
--outSAMattributes All \
--outFilterIntronMotifs RemoveNoncanonicalUnannotated
else
#paired-end
STAR --genomeDir /data/tongji2/Pipeline_get_piRNA_transcript/mouse/mouse_testes_star_index/ \
--runThreadN $CPU \
--readFilesIn $samp_1_fq $samp_2_fq \
--outFileNamePrefix ./star_sj/${samp_name} \
--outFilterMismatchNmax 2 \
--outFilterMultimapNmax 100 \
--outSAMattributes All \
--outFilterIntronMotifs RemoveNoncanonicalUnannotated
fi
# remove index file for space_free
rm -rf temp_index
#-samtools-#
#transfer sam to bam, sort.
samtools view -bhS -o ./star_sj/${samp_name}.bam ./star_sj/${samp_name}Aligned.out.sam
samtools sort ./star_sj/${samp_name}.bam ./star_sj/${samp_name}.sort
rm -rf ./star_*/*.sam
rm -rf ./star_sj/${samp_name}.bam
#-cufflink-#
#transcriptome assembling
mkdir ./cufflink
cufflinks -o ./cufflink/ \
-g $anno \
-p $CPU \
--library-type fr-firststrand \
-u -j 0.2 \
--min-frags-per-transfrag $min_frag \
--overlap-radius 250 \
./star_sj/${samp_name}.sort.bam > /dev/null 2>&1
#run cuffcompare to check the accuracy of assembled transcript
#the main performance of cuffcompare is in .states file
#and we can get the geneId/transcriptId from .transcripts.gtf.tmap (to be doing)
cuffcompare -r $anno \
-o $samp_name \
./cufflink/transcripts.gtf
#-Python-#
#transfer GTF file to bed file
#python ${pyth_path}gtfToBed_Dec2.py ./cufflink/transcripts.gtf ./${samp_name}.transcripts_assemble.bed
echo "Finish!"
#########################