Skip to content

Latest commit

 

History

History
88 lines (62 loc) · 7.12 KB

02_BCATPhylogeneticTree.md

File metadata and controls

88 lines (62 loc) · 7.12 KB

Create a phylogenetic tree of soybean and Arabidopsis BCATs

/work/gif3/masonbrink/Baum/09_BCATsSoybeanArabidopsis

# Download the genome, GFF3, and protein files from Phytozome via the JGI file-download endpoint.
# The POST body lists file IDs under the "Phytozome-447" and "Phytozome-880" keys; the archive is
# written to download.20240606.132508.zip.
# NOTE(review): the jgi_session cookie looks like a login-specific session ID — presumably it must be
# replaced with a current one before this can be re-run; confirm.

curl --cookie jgi_session=/api/sessions/340d382a06927910b68224c177625497 --output download.20240606.132508.zip -d "{\"ids\":{\"Phytozome-447\":[\"587b0adf7ded5e4229d885ab\",\"587b0ade7ded5e4229d885a8\",\"587b0add7ded5e4229d885a6\"],\"Phytozome-880\":[\"65c561da23084663654f942c\",\"65c561db23084663654f943a\",\"65c561d923084663654f941f\",\"65c561da23084663654f9430\"]}}" -H "Content-Type: application/json" https://files-download.jgi.doe.gov/filedownload/

Clean up the proteins

/work/gif3/masonbrink/Baum/09_BCATsSoybeanArabidopsis/01_Orthofinder/ProteinSeqs

# Keep only the first whitespace-delimited token of each line (this trims FASTA headers down to
# the sequence ID) and strip every '*' character from the result.
awk '{print $1}' ../../Phytozome/PhytozomeV13/Gmax/Wm82.a6.v1/annotation/Gmax_880_Wm82.a6.v1.protein.fa | tr -d '*' > CleanSoybeanProteins.fasta
awk '{print $1}' ../../Phytozome/PhytozomeV12/early_release/Athaliana_447_Araport11/annotation/Athaliana_447_Araport11.protein.fa | tr -d '*' > CleanArabidopsisProteins.fasta

Run OrthoFinder

# Load the OrthoFinder module, then run it on the FASTA files in ProteinSeqs/
# (-t: sequence-search threads, -a: analysis threads, -n: results-directory name suffix).
ml orthofinder/2.5.4-py310-4cstbgx
orthofinder -f ProteinSeqs/ -t 36 -a 35 -n Orthofinder

Identify which proteins are BCAT

# Found these in NCBI Protein with a keyword search.

>AEE28538.1 branched-chain amino acid transaminase 2 [Arabidopsis thaliana]
MIKTITSLRKTLVLPLHLHIRTLQTFAKYNAQAASALREERKKPLYQNGDDVYADLDWDNLGFGLNPADYMYVMKCSKDGEFTQGELSPYGNIQLSPSAGVLNYGQAIYEGTKAYRKENGKLLLFRPDHNAIRMKLGAERMLMPSPSVDQFVNAVKQTALANKRWVPPAGKGTLYIRPLLMGSGPILGLGPAPEYTFIVYASPVGNYFKEGMAALNLYVEEEYVRAAPGGAGGVKSITNYAPVLKALSRAKSRGFSDVLYLDSVKKKYLEEASSCNVFVVKGRTISTPATNGTILEGITRKSVMEIASDQGYQVVEKAVHVDEVMDADEVFCTGTAVVVAPVGTITYQEKRVEYKTGDESVCQKLRSVLVGIQTGLIEDNKGWVTDIN
>AED98106.1 branched-chain amino acid aminotransferase 5 / branched-chain amino acid transaminase 5 (BCAT5) [Arabidopsis thaliana]
MERSAVASGFHRNYILCASRAATSTTRLHSLSSLRNFPSSSLRIRHCPSPISSNFIVSEVSRNRRCDAVSSSTTDVTELAEIDWDKIDFGLKPTDYMYAMKCSRDGEFSQGQLQPFGNIDINPAAGVLNYGQGLFEGLKAYRKQDGNILLFRPEENAIRMRNGAERMCMPSPTVEQFVEAVKTTVLANKRWIPPPGKGSLYIRPLLMGTGAVLGLAPAPEYTFLIFVSPVGNYFKEGVAPINLIVETEFHRATPGGTGGVKTIGNYAAVLKAQSIAKAKGYSDVLYLDCLHKRYLEEVSSCNIFIVKDNVISTPEIKGTILPGITRKSIIEVARSQGFKVEERNVTVDELVEADEVFCTGTAVVLSPVGSITYKSQRFSYGEDGFGTVSKQLYTSLTSLQMGLSEDNMNWTVQLS
>AEE28535.1 branched-chain amino acid transaminase 1 [Arabidopsis thaliana]
MALRRCLPQYSTTSSYLSKIWGFRMHGTKAAASVVEEHVSGAEREDEEYADVDWDNLGFSLVRTDFMFATKSCRDGNFEQGYLSRYGNIELNPAAGILNYGQGLIEGMKAYRGEDGRVLLFRPELNAMRMKIGAERMCMHSPSVHQFIEGVKQTVLANRRWVPPPGKGSLYLRPLLFGSGASLGVAAASEYTFLVFGSPVQNYFKEGTAALNLYVEEVIPRAYLGGTGGVKAISNYGPVLEVMRRAKSRGFSDVLYLDADTGKNIEEVSAANIFLVKGNTIVTPATSGTILGGITRKSIIEIALDLGYKVEERSVPVEELKEAEEVFCTGTAAGVASVGSITFKNTRTEYKVGDGIVTQQLRSILVGIQTGSIQDTKDWVLQIA
>sp|Q9M401.1|BCAT3_ARATH RecName: Full=Branched-chain-amino-acid aminotransferase 3, chloroplastic; Short=Atbcat-3; Flags: Precursor
MERAAILPSVNQNYLLCPSRAFSTRLHSSTRNLSPPSFASIKLQHSSSSVSSNGGISLTRCNAVSSNSSSTLVTELADIDWDTVGFGLKPADYMYVMKCNIDGEFSKGELQRFGNIEISPSAGVLNYGQGLFEGLKAYRKKDGNNILLFRPEENAKRMRNGAERMCMPAPTVEQFVEAVTETVLANKRWVPPPGKGSLYVRPLLMGTGAVLGLAPAPEYTFIIYVSPVGNYFKEGVAPINLIVENEFHRATPGGTGGVKTIGNYAAVLKAQSIAKAKGYSDVLYLDCIYKRYLEEVSSCNIFIVKDNVISTPEIKGTILPGITRKSMIDVARTQGFQVEERNVTVDELLEADEVFCTGTAVVVSPVGSVTYKGKRVSYGEGTFGTVSKQLYTVLTSLQMGLIEDNMKWTVNLS
>sp|Q9LE06.1|BCAT4_ARATH RecName: Full=Methionine aminotransferase BCAT4; AltName: Full=Branched-chain-amino-acid aminotransferase 4; Short=Atbcat-4; AltName: Full=Methionine-oxo-acid transaminase BCAT4
MAPSAQPLPVSVSDEKYANVKWEELAFKFVRTDYMYVAKCNHGESFQEGKILPFADLQLNPCAAVLQYGQGLYEGLKAYRTEDGRILLFRPDQNGLRLQAGADRLYMPYPSVDQFVSAIKQVALANKKWIPPPGKGTLYIRPILFGSGPILGSFPIPETTFTAFACPVGRYHKDNSGLNLKIEDQFRRAFPSGTGGVKSITNYCPVWIPLAEAKKQGFSDILFLDAATGKNIEELFAANVFMLKGNVVSTPTIAGTILPGVTRNCVMELCRDFGYQVEERTIPLVDFLDADEAFCTGTASIVTSIASVTFKDKKTGFKTGEETLAAKLYETLSDIQTGRVEDTKGWTVEIDRQG
>sp|Q9LPM9.1|BCAT6_ARATH RecName: Full=Branched-chain-amino-acid aminotransferase 6; Short=Atbcat-6
MAPSSSPLRTTSETDEKYANVKWEELGFALTPIDYMYVAKCRQGESFTQGKIVPYGDISISPCSPILNYGQGLFEGLKAYRTEDDRIRIFRPDQNALRMQTGAERLCMTPPTLEQFVEAVKQTVLANKKWVPPPGKGTLYIRPLLLGSGATLGVAPAPEYTFLIYASPVGDYHKVSSGLNLKVDHKYHRAHSGGTGGVKSCTNYSPVVKSLLEAKSAGFSDVLFLDAATGRNIEELTACNIFIVKGNIVSTPPTSGTILPGVTRKSISELAHDIGYQVEERDVSVDELLEAEEVFCTGTAVVVKAVETVTFHDKKVKYRTGEAALSTKLHSMLTNIQMGVVEDKKGWMVDIDPCQG
>sp|Q9LPM8.1|BCAT7_ARATH PUTATIVE PSEUDOGENE: RecName: Full=Putative branched-chain-amino-acid aminotransferase 7; Short=Atbcat-7
MAPSVHPSSSPLFTSKADEKYANVKWDELGFALVPTDYMYVAKCKQGESFSTGEIVPYGDISISPCAGILNYGQGLFEGLKAYRTEDGRITLFRPDQNAIRMQTGADRLCMTPPSPEQFVEAVKQTVLANNKWVPPPGKGALYIRPLLIGTGAVLGVASAPEYTFLIYTSPVGNYHKASSGLNLKVDHNHRRAHFGGTGGVKSCTNYSPVVKSLIEAKSSGFSDVLFLDAATGKNIEEVSTCNIFILKGNIVSTPPTSGTILPGITRKSICELARDIGYEVQERDLSVDELLEAEEVFCTGTAVVIKAVETVTFHDKRVKYRTGEEAFSTKLHLILTNIQMGVVEDKKGWMMEIDHLVGTDSFPDET
>sp|Q9ASR4.1|BCAL2_ARATH RecName: Full=Branched-chain-amino-acid aminotransferase-like protein 2
MEVIHAWSAPRSLSTTLMYSFAQRDDIEVLDEPLYAAFLKSTGVDRPYKDELLSKMECDGEKVVKDIIYGPGKKKYRFCKHISKQRLLGLPSELMSEGKHFILIRNPLNILPSFEKIHPSSFHELGLGELVSIYSDLCQMGTPPAIIDADELQRDPEATLRSLCDDLEIPFQASMLKWEAGPIPEDGLWAPWWYETLHKSTGFSSPQKYPQTFPLMHYDLLEQCLPLYNILRCHMKHKSSLLSSTLPPPSLPVPENAKLLAWVGDEIVPREMAKVSVFDSVVQGGDSVWEGLRIYKGKVFKLEEHLDRLSDSAKALAFNNVPTREEIKEAIFRTLITNGMFDNTHIRLSLTRGKKVTSGMSPAFNRYGCTLIVLAEWKPPVYDNDGGIVLVTATTRRNSPNNLDSKIHHNNLLNNILAKIESNNANVDDAIMLDKDGFVSETNATNIFMVKKDRVLTPHADYCLPGITRATVMELVVKENFILEERRISLSEFHTADEVWTTGTMGELSPVVKIDGRVIGEGKVGPVTRRLQNAYKKLTDGSGVPIPTYQEVKNLEPCV

# Use these as BLAST queries against the soybean and Arabidopsis gene predictions to identify the correct genes.

BLAST

# Search the Arabidopsis BCAT query proteins against the combined protein set, tabular output.
# NOTE(review): assumes the BLAST protein database AllProteins.fasta and the cdbfasta index
# AllProteins.fasta.cidx were built beforehand — those steps are not recorded here.
blastp \
    -query AraBCATQUERY.fasta \
    -db AllProteins.fasta \
    -outfmt 6 \
    -num_threads 6 \
    -out BCA2Predictions.blastout

# Keep hits whose bit score (column 12) is above 300, reduce to unique subject IDs (column 2),
# and pull those sequences out of the index.
# A minimum score of 100 or 300 made no difference: still 49 sequences.
awk '$12 > 300 {print $2}' BCA2Predictions.blastout | sort -u | cdbyank AllProteins.fasta.cidx > AllBCATs.fasta

# They did not want isoforms, so all secondary isoforms were removed, except for the gene that was
# most identical to Allies protein interactor Glyma.06G050100.3.p.
# NOTE(review): the ID list is named "AllIsoforms" but feeds the one-isoform FASTA — presumably it
# already holds the curated single-isoform IDs; confirm.
cdbyank AllProteins.fasta.cidx < BCATGeneListAllIsoforms.list > OneIsoformBCATProteins.fasta

MSA

# Load Clustal Omega.
ml clustal-omega/1.2.4-qlv4yt4

# Align every BCAT hit, isoforms included.
clustalo --infile=AllBCATs.fasta --seqtype=protein --outfile=MSAAllBCATs.alignment --threads=35

# Align the one-isoform-per-gene set.
clustalo --infile=OneIsoformBCATProteins.fasta --seqtype=protein --outfile=OneIsoformBCATProteins.alignment --threads=35

Create the TREE

# Quick first-pass tree from the all-isoform alignment using FastTree (default protein mode).
FastTree -out BCATtree.tree MSAAllBCATs.alignment


# Best ML tree: a single RAxML search on the one-isoform alignment.
# -m PROTGAMMAAUTO selects the protein substitution model automatically under a GAMMA rate model;
# -p fixes the random seed for the parsimony starting tree so the run is repeatable.
# Load the module and call raxmlHPC from PATH (as the bootstrap-annotation step does) instead of
# hard-coding the absolute install path, which breaks whenever the software stack is rebuilt.
ml raxml/8.2.12-szg7bbv
raxmlHPC -s OneIsoformBCATProteins.alignment -n SingleIsoform2 -m PROTGAMMAAUTO -p 84381764921


# Bootstrapped trees with RAxML: 1000 replicates (-# 1000) seeded by -b 12345.
# Use the raxmlHPC provided by the loaded module rather than bypassing it with the absolute
# install path, so the command keeps working if the install prefix changes.
# NOTE(review): --bootstop-perms presumably only matters for the automatic bootstopping test and
# has no effect with a fixed replicate count; kept unchanged — confirm against the RAxML manual.
ml raxml/8.2.12-szg7bbv
raxmlHPC \
    -s OneIsoformBCATProteins.alignment \
    -n SingleIsoform3 \
    -m PROTGAMMAAUTO \
    -p 84381764921 \
    --bootstop-perms=1000 \
    -b 12345 \
    -# 1000

# Attach the bootstrap support values to the best ML tree: -f b draws the bipartitions found in
# the bootstrap replicates (-z) onto the reference tree (-t).
ml raxml/8.2.12-szg7bbv
raxmlHPC \
    -f b \
    -t RAxML_bestTree.SingleIsoform2 \
    -z RAxML_bootstrap.SingleIsoform3 \
    -n final_tree_with_bootstrap \
    -m PROTGAMMAAUTO