diff --git a/CONTRIBUTORS.yaml b/CONTRIBUTORS.yaml index 8a9a770e48130e..953101328423cb 100644 --- a/CONTRIBUTORS.yaml +++ b/CONTRIBUTORS.yaml @@ -1765,6 +1765,11 @@ nagoue: email: nadia.goue@uca.fr orcid: 0000-0003-2750-1473 joined: 2019-07 + +Najatamk: + name: Najat Amoukou + email: najatibrahim21@gmail.com + joined: 2024-07 nakucher: name: Natalie Kucher diff --git a/topics/ecology/tutorials/ENA_Biodiv_submission/images/image.PNG b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image.PNG new file mode 100644 index 00000000000000..399442e544e5b3 Binary files /dev/null and b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image.PNG differ diff --git a/topics/ecology/tutorials/ENA_Biodiv_submission/images/image1.png b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image1.png new file mode 100644 index 00000000000000..ea2bb9b0ec24a9 Binary files /dev/null and b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image1.png differ diff --git a/topics/ecology/tutorials/ENA_Biodiv_submission/images/image2.png b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image2.png new file mode 100644 index 00000000000000..073d34fd80c1d7 Binary files /dev/null and b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image2.png differ diff --git a/topics/ecology/tutorials/ENA_Biodiv_submission/images/image3.png b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image3.png new file mode 100644 index 00000000000000..a50312bca2f520 Binary files /dev/null and b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image3.png differ diff --git a/topics/ecology/tutorials/ENA_Biodiv_submission/images/image4.png b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image4.png new file mode 100644 index 00000000000000..34be74c32a85c0 Binary files /dev/null and b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image4.png differ diff --git a/topics/ecology/tutorials/ENA_Biodiv_submission/images/image5.png 
b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image5.png new file mode 100644 index 00000000000000..7a2d7a52b77f15 Binary files /dev/null and b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image5.png differ diff --git a/topics/ecology/tutorials/ENA_Biodiv_submission/images/image7.PNG b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image7.PNG new file mode 100644 index 00000000000000..399442e544e5b3 Binary files /dev/null and b/topics/ecology/tutorials/ENA_Biodiv_submission/images/image7.PNG differ diff --git a/topics/ecology/tutorials/ENA_Biodiv_submission/images/imageI.png b/topics/ecology/tutorials/ENA_Biodiv_submission/images/imageI.png new file mode 100644 index 00000000000000..835aefc6cd0bf8 Binary files /dev/null and b/topics/ecology/tutorials/ENA_Biodiv_submission/images/imageI.png differ diff --git a/topics/ecology/tutorials/ENA_Biodiv_submission/metadata_GdBqCOI_ERC000011_Test.xlsx b/topics/ecology/tutorials/ENA_Biodiv_submission/metadata_GdBqCOI_ERC000011_Test.xlsx new file mode 100644 index 00000000000000..6084e6cb9dfba1 Binary files /dev/null and b/topics/ecology/tutorials/ENA_Biodiv_submission/metadata_GdBqCOI_ERC000011_Test.xlsx differ diff --git a/topics/ecology/tutorials/ENA_Biodiv_submission/tutorial.bib b/topics/ecology/tutorials/ENA_Biodiv_submission/tutorial.bib new file mode 100644 index 00000000000000..9206b0b6e4cae4 --- /dev/null +++ b/topics/ecology/tutorials/ENA_Biodiv_submission/tutorial.bib @@ -0,0 +1,42 @@ + +# This is the bibliography file for your tutorial. 
+# +# To add bibliography (bibtex) entries here, follow these steps: +# 1) Find the DOI for the article you want to cite +# 2) Go to https://doi2bib.org and fill in the DOI +# 3) Copy the resulting bibtex entry into this file +# +# To cite the example below, in your tutorial.md file +# use {% cite Batut2018 %} +# +# If you want to cite an online resource (website etc) +# you can use the 'online' format (see below) +# +# You can remove the examples below + +@article{Batut2018, + doi = {10.1016/j.cels.2018.05.012}, + url = {https://doi.org/10.1016/j.cels.2018.05.012}, + year = {2018}, + month = jun, + publisher = {Elsevier {BV}}, + volume = {6}, + number = {6}, + pages = {752--758.e1}, + author = {B{\'{e}}r{\'{e}}nice Batut and Saskia Hiltemann and Andrea Bagnacani and Dannon Baker and Vivek Bhardwaj and + Clemens Blank and Anthony Bretaudeau and Loraine Brillet-Gu{\'{e}}guen and Martin {\v{C}}ech and John Chilton + and Dave Clements and Olivia Doppelt-Azeroual and Anika Erxleben and Mallory Ann Freeberg and Simon Gladman and + Youri Hoogstrate and Hans-Rudolf Hotz and Torsten Houwaart and Pratik Jagtap and Delphine Larivi{\`{e}}re and + Gildas Le Corguill{\'{e}} and Thomas Manke and Fabien Mareuil and Fidel Ram{\'{i}}rez and Devon Ryan and + Florian Christoph Sigloch and Nicola Soranzo and Joachim Wolff and Pavankumar Videm and Markus Wolfien and + Aisanjiang Wubuli and Dilmurat Yusuf and James Taylor and Rolf Backofen and Anton Nekrutenko and Bj\"{o}rn Gr\"{u}ning}, + title = {Community-Driven Data Analysis Training for Biology}, + journal = {Cell Systems} +} + +@online{gtn-website, + author = {GTN community}, + title = {GTN Training Materials: Collection of tutorials developed and maintained by the worldwide Galaxy community}, + url = {https://training.galaxyproject.org}, + urldate = {2021-03-24} +} diff --git a/topics/ecology/tutorials/ENA_Biodiv_submission/tutorial.md b/topics/ecology/tutorials/ENA_Biodiv_submission/tutorial.md new file mode 100644 index 
00000000000000..baa7c2b7601030 --- /dev/null +++ b/topics/ecology/tutorials/ENA_Biodiv_submission/tutorial.md @@ -0,0 +1,393 @@ +--- +layout: tutorial_hands_on + +title: Data submission using ENA upload Tool +questions: +- How to prepare sequences for submission to ENA? +- How to upload raw sequences to ENA? +objectives: +- Manage sequencing files (ab1, FASTQ, FASTA, FASTQ.GZ) +- Clean sequences in an automated and reproducible manner +- Perform alignments for each sequence +- Have the necessary sequence format to submit to ENA +- Submit raw reads to ENA using the ENA upload Tool +time_estimation: 2h +key_points: +- Clean raw ab1 sequences and compare filtered sequences to NCBI nucleotide database +- Submit cleaned and unique sequences to European Nucleotide Archive (ENA) resource +contributions: + authorship: + - Najatamk + - yvanlebras + testing: + - PaulineSGN + - yvanlebras + funding: + - pndb + +subtopic: ecologymetadatamgt +--- + + +This tutorial will guide you through the necessary steps to manage and prepare sequencing files (ab1, FASTQ, FASTA) for submission to the genomic database ENA. +This workflow will take you from raw sequences in AB1 format through all the necessary steps to integrate these sequences into the ENA genomic database. We will convert the files into FASTQ and FASTA formats after performing quality control. +Additionally, we will perform alignments with the NCBI database to ensure the accuracy of your sequences. You will then need to fill a metadata Excel template to use the ENA upload Tool. +The workflow is made of 17 Galaxy tools; we will present them and explain what they do. +The goal is to present an accessible and reproducible workflow for data submission. + +> +> +> In this tutorial, we will cover: +> +> 1. TOC +> {:toc} +> +{: .agenda} + +# Prepare raw data + +> Data Upload +> +> 1. **Create a new history** for this tutorial +> +> {% snippet faqs/galaxy/histories_create_new.md %} +> +> 2. **Import** the raw sequences files. 
+> +> ``` +> https://data.indores.fr/api/access/datafile/3673 +> https://data.indores.fr/api/access/datafile/3609 +> ``` +> +> {% snippet faqs/galaxy/datasets_import_via_link.md %} +> +> 3. **Rename** {% icon galaxy-pencil %} your datafiles +> - `3673` becomes `A2_RC_8F2_B.pl_HCOI.ab1` +> - `3609` becomes `A12_RC_9G4_B.md_HCOI.ab1` +> +> {% snippet faqs/galaxy/datasets_rename.md %} +> +> 4. **Check the datatype** +> - Make sure it is `ab1`, and change it if not. +> +> {% snippet faqs/galaxy/datasets_change_datatype.md %} +> +> 5. **Build a Collection** containing these two files; you can name it "ab1", for example +> +> {% snippet faqs/galaxy/collections_build_list.md %} +> +{: .hands_on} + + +## Tools used in the "Prepare Data submission" Workflow + +The following steps take ab1 sequence files as input and produce filtered FASTQ and FASTA files, so that sequences passing the quality checks can be compared to the NCBI nucleotide database using BLASTn. + +### Converting Ab1 files to FASTQ + +> ab1 to FASTQ converter +> +> 1. {% tool [ab1 to FASTQ converter](toolshed.g2.bx.psu.edu/repos/ecology/ab1_fastq_converter/ab1_fastq_converter/1.20.0) %} with the following parameters: +> - {% icon param-collection %} *"Input ab1 file"*: `ab1` data collection created at the previous step +> +{: .hands_on} + + +### Quality Control + +We are doing a first Quality control on the raw files using FastQC and MultiQC. + +> FastQC +> 1. {% tool [FastQC](toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.74+galaxy0) %} with the following parameters: +> - {% icon param-file %} *"Raw read data from your current history"*: `ab1.fastq` data collection created at the previous step +> +> 2. 
{% tool [MultiQC](toolshed.g2.bx.psu.edu/repos/iuc/multiqc/multiqc/1.11+galaxy1) %} with the following parameters: +> - In *"Results"*: +> - {% icon param-repeat %} *"Insert Results"* +> - *"Which tool was used generate logs?"*: `FastQC` +> - In *"FastQC output"*: +> - {% icon param-file %} *"RawData FastQC output"*: `FastQC on collection X:` data collection created at the previous step +> +> 3. *Check on the HTML files the general quality statistics of your sequences* +> +{: .hands_on} + + +> Question +> +> 1. What is the quality of your sequences? +> 2. Do you have adapters? +> +> > +> > +> > 1. Quality is quite good looking at the "status checks" section of MultiQC. As expected (because here we only have one sequence per file) "Per base sequence Content" and "Overrepresented sequences" Sections are "bad" for both sequence files. "adapter content" section also shows a "bad" result for the A2_RC_8F2_B.pl_HCOI.ab1 file. +> > +> > 2. The A2_RC_8F2_B.pl_HCOI.ab1 file seems to have adapters in it. +> > +> {: .solution} +{: .question} + +# Cleaning the Data + +## Cutadapt + +Cutadapt enables the removal of adapters, polyA tails, and other artifacts from sequences. The tool also filters reads based on quality. + +> Cutadapt +> +> 1. {% tool [Cutadapt](toolshed.g2.bx.psu.edu/repos/lparsons/cutadapt/cutadapt/4.8+galaxy0) %} with the following parameters: +> - {% icon param-collection %} *"FASTQ/A file"*: the collection with your data (output of {% icon tool %} **ab1 to FastQ converter**) +> - **"Single-end or Paired-end reads?"**: `Single-end` +> - In **"Other Read Trimming Options"**: +> - **"Quality cutoff(s) (R1)"**: `30` +> - **"Shortening reads to a fixed length"**: `Disabled` +> +> > Suggestions +> > +> > You may consider changing these parameters depending on the quality of your dataset. +> {: .comment} +> +{: .hands_on} + +> Quality Control +> +> We do a second quality control similar to the first one to check the quality of the sequences after cleaning them. 
+{: .comment} + + +## Quality Control with FastQC and MultiQC + +> FastQC +> +> 1. {% tool [FastQC](toolshed.g2.bx.psu.edu/repos/devteam/fastqc/fastqc/0.74+galaxy0) %} with the following parameters: +> - {% icon param-collection %} *"Raw read data from your current history"*: output from {% icon tool%} **Cutadapt** +> +> 2. {% tool [MultiQC](toolshed.g2.bx.psu.edu/repos/iuc/multiqc/multiqc/1.11+galaxy1) %} with the following parameters: +> - In *"Results"*: +> - *"Which tool was used generate logs?"*: `FastQC` +> - {% icon param-repeat %} *"Insert FastQC output"* +> - {% icon param-collection %} *"FastQC output"*: the `raw` output from {% icon tool %} **FastQC** +> +> > Comment +> > +> > You should notice an improvement on the quality of your sequences. +> {: .comment} +> +{: .hands_on} + +## Filtering the collection + + +> Filter empty datasets +> +> 1. {% tool [Filter empty datasets](__FILTER_EMPTY_DATASETS__) %} with the following parameters +> - {% icon param-collection %} *"Input Collection"*: output collection from Cutadapt step +> +> 2. {% tool [FASTQ Groomer](toolshed.g2.bx.psu.edu/repos/devteam/fastq_groomer/fastq_groomer/1.1.5+galaxy2) %} with the following parameters: +> - {% icon param-collection %} *"File to groom"* : output collection from the {% icon tool %} **Filter empty datasets** +> +> This step is notably there to produce "standardized" fastqsanger sequences files so we can then use other tools accepting only such data format. +> +> 3. {% tool [Filter FASTQ](toolshed.g2.bx.psu.edu/repos/devteam/fastq_filter/fastq_filter/1.1.5) %} with the following parameters: +> - *"FASTQ File"*: output collection from {% icon tool %} **FASTQ Groomer** +> - *"Minimum size"*: `300` +> +> > Comment +> > +> > Here we decide to only keep sequences of 300bp or above; you may change this parameter depending on your dataset. +> {: .comment} +> +{: .hands_on} + + +### Changing files names + +> Extract element identifiers and remove extensions +> +> 1. 
{% tool [Extract element identifiers](toolshed.g2.bx.psu.edu/repos/iuc/collection_element_identifiers/collection_element_identifiers/0.0.2) %} +> - {% icon param-collection %} *"Dataset collection"*: output from the previous step +> +> 2. {% tool [Regex Find And Replace](toolshed.g2.bx.psu.edu/repos/galaxyp/regex_find_replace/regex1/1.0.3) %} with the following parameters: +> - *"Select lines from"*: output of the previous step +> - In *"Check"*: +> - {% icon param-repeat %} *"Insert Check"* +> - *"Find Regex"*: `.ab1` +> - *"Replacement"*: `` +> +> > Comment +> > +> > This is to ensure that all your files names end with .fastq.gz +> {: .comment} +> +> 3. {% tool [Paste](Paste1) %} with the following parameters: +> - {% icon param-file %} *"Paste"*: the file from {% icon tool %} **Extract element identifiers** +> - {% icon param-file %} *"and"*: the file from {% icon tool %} **Regex Find And Replace** +> - {% icon param-select %} *"Delimited by"*: Tab +> +> 4. **Check the datatype** +> - should be 'tabular'. If not, change it now. +> +> {% snippet faqs/galaxy/datasets_change_datatype.md %} +> +{: .hands_on} + + + +> Relabel identifiers +> +> 1. {% tool [Relabel identifiers](__RELABEL_FROM_FILE__) %} with the following parameters: +> - {% icon param-collection %} *"Input Collection"*: output from {% icon tool %} **Filter FastQ** +> - *"How should the new labels be specified?"*: `Map original identifiers to new ones using a two column table.` +> +{: .hands_on} + + +## Alignments on NCBI database + +> NCBI BLAST alignment +> +> 1. {% tool [FASTQ to FASTA](toolshed.g2.bx.psu.edu/repos/devteam/fastqtofasta/fastq_to_fasta_python/1.1.5) %} with the following parameters: +> - {% icon param-collection %} *"Input FASTQ File"*: output collection from {% icon tool %} **Relabel Identifiers** +> +> 2. 
{% tool [NCBI BLAST+ blastn](toolshed.g2.bx.psu.edu/repos/devteam/ncbi_blast_plus/ncbi_blastn_wrapper/2.14.1+galaxy2) %} with the following parameters: +> - {% icon param-collection %} Nucleotide query sequence(s): output from the previous step +> - *"Subject database/sequences"*: `Locally installed BLAST database` +> - *"Nucleotide BLAST database"*: `NCBI NT (01 Sep 2023)` +> - *"Output format"*: `Tabular (extended 25 columns)` +> - *"Advanced Options"*: `Hide Advanced Options` +{: .hands_on} + + +> Extracting best hits +> +> 1. {% tool [Unique](toolshed.g2.bx.psu.edu/repos/bgruening/unique/bg_uniq/0.3) %} with the following parameters: +> - {% icon param-collection %} *"File to scan for unique values"*: output from the previous step +> - *"Advanced Options"*: `Show Advanced Options` +> - *"Column start"*: `c1` +> - *"Column end"*: `c1` +> +{: .hands_on} + +## Workflow Outputs + +1. **Collection of raw FASTQ files:** Input AB1 files converted into FASTQ files. + +2. **Collection of FASTQ files (after quality control)**: Renamed Fastq files ready for submission after quality control and filtering. + +3. **Collection of FASTA files**: FASTQ files converted into FASTA format. Used for conducting BLAST alignments. + +4. **FastQC Quality Control Results** before and after cleaning: Both raw FastQC results and HTML reports are created + +5. **MultiQC Quality Control Results** before and after cleaning: Both raw MultiQC statistics and HTML report are created + +6. **Raw Blast Results**: Results of BLAST alignments conducted on our sequences. 
Columns names are: + + | Column | NCBI name | Description | + |--------|--------------|------------------------------------------------| + | 1 | qaccver | Query accession dot version | + | 2 | saccver | Subject accession dot version (database hit) | + | 3 | pident | Percentage of identical matches | + | 4 | length | Alignment length | + | 5 | mismatch | Number of mismatches | + | 6 | gapopen | Number of gap openings | + | 7 | qstart | Start of alignment in query | + | 8 | qend | End of alignment in query | + | 9 | sstart | Start of alignment in subject (database hit) | + | 10 | send | End of alignment in subject (database hit) | + | 11 | evalue | Expectation value (E-value) | + | 12 | bitscore | Bit score | + | 13 | sallseqid | All subject Seq-id(s), separated by a ';' | + | 14 | score | Raw score | + | 15 | nident | Number of identical matches | + | 16 | positive | Number of positive-scoring matches | + | 17 | gaps | Total number of gaps | + | 18 | ppos | Percentage of positive-scoring matches | + | 19 | qframe | Query frame | + | 20 | sframe | Subject frame | + | 21 | qseq | Aligned part of query sequence | + | 22 | sseq | Aligned part of subject sequence | + | 23 | qlen | Query sequence length | + | 24 | slen | Subject sequence length | + | 25 | salltitles | All subject title(s), separated by a '<>' | + +7. **Filtered Blast Results** +Files containing the closest homologous sequences. + +8. **Collection of Fastq files** +Contains filtered sequences. + +# How to use ENA upload Tool + +## Adding ENA "Webin" credentials to your Galaxy user information + +> Having an ENA Submission Account +> +> Make sure you have a submission account with the European Nucleotide Archive (ENA). You will need the identifier and the password, available through https://www.ebi.ac.uk/ena/submit/webin/login. +> +{: .comment} + + +> Add your "WEBIN" credentials to your Galaxy account +> **Instructions:** +> - From the Menu, click on "User" > "Preferences". Click on "Manage Information". 
Scroll down to "Your ENA Webin account details" and enter your ENA "Webin" identifier and password. +> ![Adding ENA Webin credentials](./images/imageI.png) +{: .hands_on} + +## Submitting using a metadata template file + +For this tutorial we will use the ENA default sample checklist. + +![Excel Metadata template](./images/image2.png) + +**Note:** It is crucial to fill in all the fields marked "Mandatory" and ensure that the sequence names match exactly those indicated in the Excel file. + +> ENA Metadata Templates +> +> You can find metadata templates for each checklist in the [ELIXIR-Belgium GitHub repository](https://github.com/ELIXIR-Belgium/ENA-metadata-templates) +> +> 1. Direct download link of the [ENA default sample checklist]( https://github.com/ELIXIR-Belgium/ENA-metadata-templates/raw/main/templates/ERC000011/metadata_template_ERC000011.xlsx) +> +> 2. Direct download link of the [ENA default sample checklist filled with elements for the training](https://github.com/galaxyproject/training-material/raw/24776cf161e38ac0449755749d23e851400020aa/topics/ecology/tutorials/ENA_Biodiv_submission/metadata_GdBqCOI_ERC000011_Test.xlsx) +> +> You will need to import this file into your Galaxy history. Then, use the ENA Upload Tool to proceed with the submission. +> +{: .comment} + + + +> Excel Metadata Template +> +> 1. Import the ENA default sample checklist file. +> +> ``` +> https://github.com/galaxyproject/training-material/raw/24776cf161e38ac0449755749d23e851400020aa/topics/ecology/tutorials/ENA_Biodiv_submission/metadata_GdBqCOI_ERC000011_Test.xlsx +> ``` +> +> {% snippet faqs/galaxy/datasets_import_via_link.md %} +> +> 2. 
{% tool [ENA Upload tool](toolshed.g2.bx.psu.edu/repos/iuc/ena_upload/ena_upload/0.7.3+galaxy1) %} with the following parameters: +> - *"Action to execute"*: `Add new (meta)data` +> - *"Select the metadata input method"*: `Excel file` +> - *"Select the ENA sample checklist"*: `ENA default sample checklist (ERC000011)` +> - *"Select Excel file based on template"*: `metadata_GdBqCOI_ERC000011_Test.xlsx` +> - *"Select input data"*: `Dataset or dataset collection` +> - *"Add .fastq (.gz, .bz2) extension to the Galaxy dataset names to match the ones described in the input tables?"*: `Yes` +> +> > Datatype +> > +> > The ENA upload tool will then automatically compress fastq sequences files into .fastq.gz format before submission +> {: .comment} +> +> > Danger: Submit to ENA test server! +> > We suggest you first submit to the [ENA test server](https://wwwdev.ebi.ac.uk/ena/submit/webin/) before making a public submission! Submission can be seen in `Dashboard/Study Report` +> {: .warning} +> +> ![ENA Upload tool](./images/image3.png) +> +{: .hands_on} + + + +# Conclusion + +This tutorial guides you through quality check and preparing raw data files for ENA submission. You can then verify that your sequences have been successfully sent by logging into the Test ENA portal (https://wwwdev.ebi.ac.uk/ena/submit/webin/login) and navigating to the Study Report section. diff --git a/topics/ecology/tutorials/ENA_Biodiv_submission/workflows/index.md b/topics/ecology/tutorials/ENA_Biodiv_submission/workflows/index.md new file mode 100644 index 00000000000000..e092e0ae66ddd4 --- /dev/null +++ b/topics/ecology/tutorials/ENA_Biodiv_submission/workflows/index.md @@ -0,0 +1,3 @@ +--- +layout: workflow-list +---