diff --git a/CITATION.cff b/CITATION.cff index fffb1d4..840a856 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,297 +1,297 @@ -# ----------------------------------------------------------- -# CITATION file created with {cffr} R package, v0.5.0 -# See also: https://docs.ropensci.org/cffr/ -# ----------------------------------------------------------- - -cff-version: 1.2.0 -message: 'To cite package "REcoTox" in publications use:' -type: software -title: 'REcoTox: REcoTox - a workflow to process US EPA ECOTOX Knowledgebase ASCII - files' -version: 0.4.0 -abstract: REcoTox is a semi-automated, interactive workflow to process US EPA ECOTOX - Knowledgebase entire database ASCII files to extract and process ecotoxicological - data relevant (but not restricted) to the ecotoxicity groups algae, crustaceans, - and fish in the aquatic domain. The latest version of the ASCII files is available - on US EPA ECOTOX Knowledgebase. The focus is aquatic ecotoxicity and the unit of - the retrieved data is mg/L. 
-authors: -- family-names: Schulze - given-names: Tobias - email: tobias.schulze@ufz.de - orcid: https://orcid.org/0000-0002-9744-8914 -repository: https://bioconductor.org/ -date-released: '2023-05-02' -contact: -- family-names: Schulze - given-names: Tobias - email: tobias.schulze@ufz.de - orcid: https://orcid.org/0000-0002-9744-8914 -references: -- type: software - title: 'R: A Language and Environment for Statistical Computing' - notes: Depends - url: https://www.R-project.org/ - authors: - - name: R Core Team - location: - name: Vienna, Austria - year: '2023' - institution: - name: R Foundation for Statistical Computing - version: '>= 4.3.0' -- type: software - title: data.table - abstract: 'data.table: Extension of `data.frame`' - notes: Imports - url: https://r-datatable.com - repository: https://CRAN.R-project.org/package=data.table - authors: - - family-names: Dowle - given-names: Matt - email: mattjdowle@gmail.com - - family-names: Srinivasan - given-names: Arun - email: asrini@pm.me - year: '2023' -- type: software - title: dplyr - abstract: 'dplyr: A Grammar of Data Manipulation' - notes: Imports - url: https://dplyr.tidyverse.org - repository: https://CRAN.R-project.org/package=dplyr - authors: - - family-names: Wickham - given-names: Hadley - email: hadley@posit.co - orcid: https://orcid.org/0000-0003-4757-117X - - family-names: François - given-names: Romain - orcid: https://orcid.org/0000-0002-2444-4226 - - family-names: Henry - given-names: Lionel - - family-names: Müller - given-names: Kirill - orcid: https://orcid.org/0000-0002-1416-3412 - - family-names: Vaughan - given-names: Davis - email: davis@posit.co - orcid: https://orcid.org/0000-0003-4777-038X - year: '2023' -- type: software - title: progress - abstract: 'progress: Terminal Progress Bars' - notes: Imports - url: https://github.com/r-lib/progress#readme - repository: https://CRAN.R-project.org/package=progress - authors: - - family-names: Csárdi - given-names: Gábor - - family-names: 
FitzJohn - given-names: Rich - year: '2023' -- type: software - title: purrr - abstract: 'purrr: Functional Programming Tools' - notes: Imports - url: https://purrr.tidyverse.org/ - repository: https://CRAN.R-project.org/package=purrr - authors: - - family-names: Wickham - given-names: Hadley - email: hadley@rstudio.com - orcid: https://orcid.org/0000-0003-4757-117X - - family-names: Henry - given-names: Lionel - email: lionel@rstudio.com - year: '2023' -- type: software - title: Rdpack - abstract: 'Rdpack: Update and Manipulate Rd Documentation Objects' - notes: Imports - url: https://geobosh.github.io/Rdpack/ - repository: https://CRAN.R-project.org/package=Rdpack - authors: - - family-names: Boshnakov - given-names: Georgi N. - email: georgi.boshnakov@manchester.ac.uk - year: '2023' -- type: software - title: readr - abstract: 'readr: Read Rectangular Text Data' - notes: Imports - url: https://readr.tidyverse.org - repository: https://CRAN.R-project.org/package=readr - authors: - - family-names: Wickham - given-names: Hadley - email: hadley@posit.co - - family-names: Hester - given-names: Jim - - family-names: Bryan - given-names: Jennifer - email: jenny@posit.co - orcid: https://orcid.org/0000-0002-6983-2759 - year: '2023' -- type: software - title: tibble - abstract: 'tibble: Simple Data Frames' - notes: Imports - url: https://tibble.tidyverse.org/ - repository: https://CRAN.R-project.org/package=tibble - authors: - - family-names: Müller - given-names: Kirill - email: kirill@cynkra.com - orcid: https://orcid.org/0000-0002-1416-3412 - - family-names: Wickham - given-names: Hadley - email: hadley@rstudio.com - year: '2023' -- type: software - title: tidyr - abstract: 'tidyr: Tidy Messy Data' - notes: Imports - url: https://tidyr.tidyverse.org - repository: https://CRAN.R-project.org/package=tidyr - authors: - - family-names: Wickham - given-names: Hadley - email: hadley@posit.co - - family-names: Vaughan - given-names: Davis - email: davis@posit.co - - 
family-names: Girlich - given-names: Maximilian - year: '2023' -- type: software - title: utils - abstract: 'R: A Language and Environment for Statistical Computing' - notes: Imports - authors: - - name: R Core Team - location: - name: Vienna, Austria - year: '2023' - institution: - name: R Foundation for Statistical Computing -- type: software - title: webchem - abstract: 'webchem: Chemical Information from the Web' - notes: Imports - url: https://docs.ropensci.org/webchem/ - repository: https://CRAN.R-project.org/package=webchem - authors: - - family-names: Szöcs - given-names: Eduard - year: '2023' -- type: software - title: BiocStyle - abstract: 'BiocStyle: Standard styles for vignettes and other Bioconductor documents' - notes: Suggests - url: https://github.com/Bioconductor/BiocStyle - repository: https://bioconductor.org/ - authors: - - family-names: Oleś - given-names: Andrzej - orcid: https://orcid.org/0000-0003-0285-2787 - year: '2023' - doi: 10.18129/B9.bioc.BiocStyle -- type: software - title: desc - abstract: 'desc: Manipulate DESCRIPTION Files' - notes: Suggests - url: https://github.com/r-lib/desc#readme - repository: https://CRAN.R-project.org/package=desc - authors: - - family-names: Csárdi - given-names: Gábor - email: csardi.gabor@gmail.com - - family-names: Müller - given-names: Kirill - - family-names: Hester - given-names: Jim - email: james.f.hester@gmail.com - year: '2023' -- type: software - title: knitr - abstract: 'knitr: A General-Purpose Package for Dynamic Report Generation in R' - notes: Suggests - url: https://yihui.org/knitr/ - repository: https://CRAN.R-project.org/package=knitr - authors: - - family-names: Xie - given-names: Yihui - email: xie@yihui.name - orcid: https://orcid.org/0000-0003-0645-5666 - year: '2023' -- type: software - title: markdown - abstract: 'markdown: Render Markdown with ''commonmark''' - notes: Suggests - url: https://github.com/rstudio/markdown - repository: https://CRAN.R-project.org/package=markdown - 
authors: - - family-names: Xie - given-names: Yihui - email: xie@yihui.name - orcid: https://orcid.org/0000-0003-0645-5666 - - family-names: Allaire - given-names: JJ - - family-names: Horner - given-names: Jeffrey - year: '2023' -- type: software - title: rmarkdown - abstract: 'rmarkdown: Dynamic Documents for R' - notes: Suggests - url: https://pkgs.rstudio.com/rmarkdown/ - repository: https://CRAN.R-project.org/package=rmarkdown - authors: - - family-names: Allaire - given-names: JJ - email: jj@posit.co - - family-names: Xie - given-names: Yihui - email: xie@yihui.name - orcid: https://orcid.org/0000-0003-0645-5666 - - family-names: Dervieux - given-names: Christophe - email: cderv@posit.co - orcid: https://orcid.org/0000-0003-4474-2498 - - family-names: McPherson - given-names: Jonathan - email: jonathan@posit.co - - family-names: Luraschi - given-names: Javier - - family-names: Ushey - given-names: Kevin - email: kevin@posit.co - - family-names: Atkins - given-names: Aron - email: aron@posit.co - - family-names: Wickham - given-names: Hadley - email: hadley@posit.co - - family-names: Cheng - given-names: Joe - email: joe@posit.co - - family-names: Chang - given-names: Winston - email: winston@posit.co - - family-names: Iannone - given-names: Richard - email: rich@posit.co - orcid: https://orcid.org/0000-0003-3925-190X - year: '2023' -- type: software - title: testthat - abstract: 'testthat: Unit Testing for R' - notes: Suggests - url: https://testthat.r-lib.org - repository: https://CRAN.R-project.org/package=testthat - authors: - - family-names: Wickham - given-names: Hadley - email: hadley@posit.co - year: '2023' +# ----------------------------------------------------------- +# CITATION file created with {cffr} R package, v0.5.0 +# See also: https://docs.ropensci.org/cffr/ +# ----------------------------------------------------------- + +cff-version: 1.2.0 +message: 'To cite package "REcoTox" in publications use:' +type: software +title: 'REcoTox: REcoTox - 
a workflow to process US EPA ECOTOX Knowledgebase ASCII + files' +version: 0.4.1 +abstract: REcoTox is a semi-automated, interactive workflow to process US EPA ECOTOX + Knowledgebase entire database ASCII files to extract and process ecotoxicological + data relevant (but not restricted) to the ecotoxicity groups algae, crustaceans, + and fish in the aquatic domain. The latest version of the ASCII files is available + on US EPA ECOTOX Knowledgebase. The focus is aquatic ecotoxicity and the unit of + the retrieved data is mg/L. +authors: +- family-names: Schulze + given-names: Tobias + email: tsufz1@gmail.com + orcid: https://orcid.org/0000-0002-9744-8914 +repository: https://bioconductor.org/ +date-released: '2023-05-02' +contact: +- family-names: Schulze + given-names: Tobias + email: tsufz1@gmail.com + orcid: https://orcid.org/0000-0002-9744-8914 +references: +- type: software + title: 'R: A Language and Environment for Statistical Computing' + notes: Depends + url: https://www.R-project.org/ + authors: + - name: R Core Team + location: + name: Vienna, Austria + year: '2023' + institution: + name: R Foundation for Statistical Computing + version: '>= 4.3.0' +- type: software + title: data.table + abstract: 'data.table: Extension of `data.frame`' + notes: Imports + url: https://r-datatable.com + repository: https://CRAN.R-project.org/package=data.table + authors: + - family-names: Dowle + given-names: Matt + email: mattjdowle@gmail.com + - family-names: Srinivasan + given-names: Arun + email: asrini@pm.me + year: '2023' +- type: software + title: dplyr + abstract: 'dplyr: A Grammar of Data Manipulation' + notes: Imports + url: https://dplyr.tidyverse.org + repository: https://CRAN.R-project.org/package=dplyr + authors: + - family-names: Wickham + given-names: Hadley + email: hadley@posit.co + orcid: https://orcid.org/0000-0003-4757-117X + - family-names: François + given-names: Romain + orcid: https://orcid.org/0000-0002-2444-4226 + - family-names: Henry + 
given-names: Lionel + - family-names: Müller + given-names: Kirill + orcid: https://orcid.org/0000-0002-1416-3412 + - family-names: Vaughan + given-names: Davis + email: davis@posit.co + orcid: https://orcid.org/0000-0003-4777-038X + year: '2023' +- type: software + title: progress + abstract: 'progress: Terminal Progress Bars' + notes: Imports + url: https://github.com/r-lib/progress#readme + repository: https://CRAN.R-project.org/package=progress + authors: + - family-names: Csárdi + given-names: Gábor + - family-names: FitzJohn + given-names: Rich + year: '2023' +- type: software + title: purrr + abstract: 'purrr: Functional Programming Tools' + notes: Imports + url: https://purrr.tidyverse.org/ + repository: https://CRAN.R-project.org/package=purrr + authors: + - family-names: Wickham + given-names: Hadley + email: hadley@rstudio.com + orcid: https://orcid.org/0000-0003-4757-117X + - family-names: Henry + given-names: Lionel + email: lionel@rstudio.com + year: '2023' +- type: software + title: Rdpack + abstract: 'Rdpack: Update and Manipulate Rd Documentation Objects' + notes: Imports + url: https://geobosh.github.io/Rdpack/ + repository: https://CRAN.R-project.org/package=Rdpack + authors: + - family-names: Boshnakov + given-names: Georgi N. 
+ email: georgi.boshnakov@manchester.ac.uk + year: '2023' +- type: software + title: readr + abstract: 'readr: Read Rectangular Text Data' + notes: Imports + url: https://readr.tidyverse.org + repository: https://CRAN.R-project.org/package=readr + authors: + - family-names: Wickham + given-names: Hadley + email: hadley@posit.co + - family-names: Hester + given-names: Jim + - family-names: Bryan + given-names: Jennifer + email: jenny@posit.co + orcid: https://orcid.org/0000-0002-6983-2759 + year: '2023' +- type: software + title: tibble + abstract: 'tibble: Simple Data Frames' + notes: Imports + url: https://tibble.tidyverse.org/ + repository: https://CRAN.R-project.org/package=tibble + authors: + - family-names: Müller + given-names: Kirill + email: kirill@cynkra.com + orcid: https://orcid.org/0000-0002-1416-3412 + - family-names: Wickham + given-names: Hadley + email: hadley@rstudio.com + year: '2023' +- type: software + title: tidyr + abstract: 'tidyr: Tidy Messy Data' + notes: Imports + url: https://tidyr.tidyverse.org + repository: https://CRAN.R-project.org/package=tidyr + authors: + - family-names: Wickham + given-names: Hadley + email: hadley@posit.co + - family-names: Vaughan + given-names: Davis + email: davis@posit.co + - family-names: Girlich + given-names: Maximilian + year: '2023' +- type: software + title: utils + abstract: 'R: A Language and Environment for Statistical Computing' + notes: Imports + authors: + - name: R Core Team + location: + name: Vienna, Austria + year: '2023' + institution: + name: R Foundation for Statistical Computing +- type: software + title: webchem + abstract: 'webchem: Chemical Information from the Web' + notes: Imports + url: https://docs.ropensci.org/webchem/ + repository: https://CRAN.R-project.org/package=webchem + authors: + - family-names: Szöcs + given-names: Eduard + year: '2023' +- type: software + title: BiocStyle + abstract: 'BiocStyle: Standard styles for vignettes and other Bioconductor documents' + notes: 
Suggests + url: https://github.com/Bioconductor/BiocStyle + repository: https://bioconductor.org/ + authors: + - family-names: Oleś + given-names: Andrzej + orcid: https://orcid.org/0000-0003-0285-2787 + year: '2023' + doi: 10.18129/B9.bioc.BiocStyle +- type: software + title: desc + abstract: 'desc: Manipulate DESCRIPTION Files' + notes: Suggests + url: https://github.com/r-lib/desc#readme + repository: https://CRAN.R-project.org/package=desc + authors: + - family-names: Csárdi + given-names: Gábor + email: csardi.gabor@gmail.com + - family-names: Müller + given-names: Kirill + - family-names: Hester + given-names: Jim + email: james.f.hester@gmail.com + year: '2023' +- type: software + title: knitr + abstract: 'knitr: A General-Purpose Package for Dynamic Report Generation in R' + notes: Suggests + url: https://yihui.org/knitr/ + repository: https://CRAN.R-project.org/package=knitr + authors: + - family-names: Xie + given-names: Yihui + email: xie@yihui.name + orcid: https://orcid.org/0000-0003-0645-5666 + year: '2023' +- type: software + title: markdown + abstract: 'markdown: Render Markdown with ''commonmark''' + notes: Suggests + url: https://github.com/rstudio/markdown + repository: https://CRAN.R-project.org/package=markdown + authors: + - family-names: Xie + given-names: Yihui + email: xie@yihui.name + orcid: https://orcid.org/0000-0003-0645-5666 + - family-names: Allaire + given-names: JJ + - family-names: Horner + given-names: Jeffrey + year: '2023' +- type: software + title: rmarkdown + abstract: 'rmarkdown: Dynamic Documents for R' + notes: Suggests + url: https://pkgs.rstudio.com/rmarkdown/ + repository: https://CRAN.R-project.org/package=rmarkdown + authors: + - family-names: Allaire + given-names: JJ + email: jj@posit.co + - family-names: Xie + given-names: Yihui + email: xie@yihui.name + orcid: https://orcid.org/0000-0003-0645-5666 + - family-names: Dervieux + given-names: Christophe + email: cderv@posit.co + orcid: 
https://orcid.org/0000-0003-4474-2498 + - family-names: McPherson + given-names: Jonathan + email: jonathan@posit.co + - family-names: Luraschi + given-names: Javier + - family-names: Ushey + given-names: Kevin + email: kevin@posit.co + - family-names: Atkins + given-names: Aron + email: aron@posit.co + - family-names: Wickham + given-names: Hadley + email: hadley@posit.co + - family-names: Cheng + given-names: Joe + email: joe@posit.co + - family-names: Chang + given-names: Winston + email: winston@posit.co + - family-names: Iannone + given-names: Richard + email: rich@posit.co + orcid: https://orcid.org/0000-0003-3925-190X + year: '2023' +- type: software + title: testthat + abstract: 'testthat: Unit Testing for R' + notes: Suggests + url: https://testthat.r-lib.org + repository: https://CRAN.R-project.org/package=testthat + authors: + - family-names: Wickham + given-names: Hadley + email: hadley@posit.co + year: '2023' diff --git a/DESCRIPTION b/DESCRIPTION index 1d0cff7..40afab6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,13 +1,13 @@ Package: REcoTox Type: Package Title: REcoTox - a workflow to process US EPA ECOTOX Knowledgebase ASCII files -Version: 0.4.0 -Date: 2023-5-2 +Version: 0.4.1 +Date: 2023-4-11 Authors@R: c(person(given = "Tobias", family = "Schulze", role = c("aut", "cre"), - email = "tobias.schulze@ufz.de", + email = "tsufz1@gmail.com", comment = c(ORCID = "0000-0002-9744-8914")), person(given = "Wibke", family = "Busch", @@ -34,7 +34,7 @@ Imports: Rdpack, readr, tibble, - tidyr, + tidyr, utils, webchem Suggests: @@ -43,7 +43,9 @@ Suggests: knitr, markdown, rmarkdown, - testthat + testthat, + kableExtra, + tidyverse VignetteBuilder: knitr RdMacros: diff --git a/inst/CHANGELOG.md b/inst/CHANGELOG.md index f8fbe6c..b05b656 100644 --- a/inst/CHANGELOG.md +++ b/inst/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +*Version 0.4.1 (4 November 2023)* + +- Add vignette + *Version 0.4.0 (2 May 2023)* - Refactorization of the chemical list and the final 
results schemes diff --git a/inst/extdata/database_folder/project.RData b/inst/extdata/database_folder/project.RData new file mode 100644 index 0000000..07ab5b1 Binary files /dev/null and b/inst/extdata/database_folder/project.RData differ diff --git a/inst/extdata/database_folder/tests.txt b/inst/extdata/database_folder/tests.txt index bbcb3b0..25c35eb 100644 --- a/inst/extdata/database_folder/tests.txt +++ b/inst/extdata/database_folder/tests.txt @@ -1,4 +1,4 @@ -test_id|reference_number|cas_number|test_grade|test_grade_comments|test_formulation|test_formulation_comments|test_radiolabel|test_radiolabel_comments|test_purity_mean_op|test_purity_mean|test_purity_min_op|test_purity_min|test_purity_max_op|test_purity_max|test_purity_comments|test_characteristics|species_number|organism_habitat|organism_source|organism_source_comments|organism_lifestage|organism_lifestage_comments|organism_age_mean_op|organism_age_mean|organism_age_min_op|organism_age_min|organism_age_max_op|organism_age_max|organism_age_unit|organism_init_wt_mean_op|organism_init_wt_mean|organism_init_wt_min_op|organism_init_wt_min|organism_init_wt_max_op|organism_init_wt_max|organism_init_wt_unit|organism_characteristics|organism_gender|experimental_design|study_duration_mean_op|study_duration_mean|study_duration_min_op|study_duration_min|study_duration_max_op|study_duration_max|study_duration_unit|study_duration_comments|exposure_duration_mean_op|exposure_duration_mean|exposure_duration_min_op|exposure_duration_min|exposure_duration_max_op|exposure_duration_max|exposure_duration_unit|exposure_duration_comments|study_type|study_type_comments|test_type|test_type_comments|test_location|test_location_comments|test_method|test_method_comments|exposure_type|exposure_type_comments|control_type|control_type_comments|media_type|media_type_comments|num_doses_mean_op|num_doses_mean|num_doses_min_op|num_doses_min|num_doses_max_op|num_doses_max|num_doses_comments|other_effect_comments|application_freq_mean_op|app
lication_freq_mean|application_freq_min_op|application_freq_min|application_freq_max_op|application_freq_max|application_freq_unit|application_freq_comments|application_type|application_type_comments|application_rate|application_rate_unit|application_date|application_date_comments|application_season|application_season_comments|subhabitat|subhabitat_description|substrate|substrate_description|water_depth_mean_op|water_depth_mean|water_depth_min_op|water_depth_min|water_depth_max_op|water_depth_max|water_depth_unit|water_depth_comments|geographic_code|geographic_location|latitude|longitude|halflife_mean_op|halflife_mean|halflife_min_op|halflife_min|halflife_max_op|halflife_max|halflife_unit|halflife_comments|additional_comments|created_date|modified_date|published_date +test_id|reference_number|test_cas|test_grade|test_grade_comments|test_formulation|test_formulation_comments|test_radiolabel|test_radiolabel_comments|test_purity_mean_op|test_purity_mean|test_purity_min_op|test_purity_min|test_purity_max_op|test_purity_max|test_purity_comments|test_characteristics|species_number|organism_habitat|organism_source|organism_source_comments|organism_lifestage|organism_lifestage_comments|organism_age_mean_op|organism_age_mean|organism_age_min_op|organism_age_min|organism_age_max_op|organism_age_max|organism_age_unit|organism_init_wt_mean_op|organism_init_wt_mean|organism_init_wt_min_op|organism_init_wt_min|organism_init_wt_max_op|organism_init_wt_max|organism_init_wt_unit|organism_characteristics|organism_gender|experimental_design|study_duration_mean_op|study_duration_mean|study_duration_min_op|study_duration_min|study_duration_max_op|study_duration_max|study_duration_unit|study_duration_comments|exposure_duration_mean_op|exposure_duration_mean|exposure_duration_min_op|exposure_duration_min|exposure_duration_max_op|exposure_duration_max|exposure_duration_unit|exposure_duration_comments|study_type|study_type_comments|test_type|test_type_comments|test_location|test_location_co
mments|test_method|test_method_comments|exposure_type|exposure_type_comments|control_type|control_type_comments|media_type|media_type_comments|num_doses_mean_op|num_doses_mean|num_doses_min_op|num_doses_min|num_doses_max_op|num_doses_max|num_doses_comments|other_effect_comments|application_freq_mean_op|application_freq_mean|application_freq_min_op|application_freq_min|application_freq_max_op|application_freq_max|application_freq_unit|application_freq_comments|application_type|application_type_comments|application_rate|application_rate_unit|application_date|application_date_comments|application_season|application_season_comments|subhabitat|subhabitat_description|substrate|substrate_description|water_depth_mean_op|water_depth_mean|water_depth_min_op|water_depth_min|water_depth_max_op|water_depth_max|water_depth_unit|water_depth_comments|geographic_code|geographic_location|latitude|longitude|halflife_mean_op|halflife_mean|halflife_min_op|halflife_min|halflife_max_op|halflife_max|halflife_unit|halflife_comments|additional_comments|created_date|modified_date|published_date 1000494|95|1912249|||PU|||||99.9|||||||58471|Water|||EX|||||||||||||||||||||||||||||||||||||||LAB||||S||S||FW|||||||||RECOVERY||1|||||X|||||||||||||||||||||||||||||||||| |10/06/1992|| 1000495|95|1912249|||PU|||||99.9|||||||58471|Water|||EX|||||||||||||||||||||||||||||||||||||||LAB||||S||S||FW|||||||||RECOVERY||1|||||X|||||||||||||||||||||||||||||||||| |10/06/1992|| 1000496|95|1912249|||PU|||||99.9|||||||58471|Water|||EX|||||||||||||||||||||||||||||||||||||||LAB||||S||S||FW|||||||||RECOVERY||1|||||X|||||||||||||||||||||||||||||||||| |10/06/1992|| diff --git a/vignettes/REcoTox.R b/vignettes/REcoTox.R new file mode 100644 index 0000000..e65f7ff --- /dev/null +++ b/vignettes/REcoTox.R @@ -0,0 +1,215 @@ +## ----biocstyle, echo = FALSE, messages = FALSE, results = "hide"-------------- +BiocStyle::markdown() + +## ----init, message = FALSE, echo = FALSE, results = "hide"-------------------- +## Silently 
loading all packages +library(BiocStyle) +library(desc) +library(kableExtra) +library(tidyverse) + +## ----load REcoTox package, eval = FALSE, echo = TRUE, message = FALSE, warning = FALSE---- +# # Load the REcoTox package +# library(REcoTox) + +## ----R Documentation, echo = TRUE, eval = FALSE------------------------------- +# # Documentation of REcoTox +# help(package = "REcoTox") + +## ----initialize folders, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE---- +# Path of the project folder +project_folder <- "REcoTox_demo" + +database_folder <- system.file("extdata/database_folder", package="REcoTox") +# The project folder is created in the home directory +project_path <- normalizePath(ifelse(.Platform$OS.type == "unix", + paste0("~/", project_folder), + paste0( + Sys.getenv("HOMEPATH"), + "\\", + project_folder + ) +)) + +# An existing folder is deleted +if (dir.exists(project_folder)) { + unlink(project_folder, recursive = TRUE) +} + +## ----create project, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE---- +project <- REcoTox::create_project(database_path = database_folder, + project_path, + initalise_database_project = TRUE, # create the basic project from current ASCII files in DB folder + initalise_project = TRUE, # initializes the project folder + load_default = FALSE) # loads the default project in the project folder in the memoryfault_example = TRUE + +file.copy( + from = system.file( + "extdata", + "Query_EcoTox_DB.R", + package = "REcoTox" + ), + to = normalizePath( + path = file.path( + project_folder, + "Query_EcoTox_DB.R" + ), + winslash = "\\", + mustWork = FALSE + ), + overwrite = TRUE + ) + + +## ----list project folder------------------------------------------------------ +# List files and directories in project_folder +list.files(project_folder, recursive = TRUE, include.dirs = TRUE) + +## ----list database folder----------------------------------------------------- +# List files and directories in project_folder 
+list.files(database_folder, recursive = TRUE, include.dirs = TRUE) + +## ----view chemical_properties, echo = TRUE, eval = TRUE, message = TRUE------- +# Review of the chemical properties +chemical_properties <- readr::read_csv(file = normalizePath(path = file.path( + database_folder, + "chemical_properties.csv" +), ), show_col_types = FALSE) + +kable( + chemical_properties %>% + head(5), + format = "html", + digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") + +## ----view results, echo = TRUE, eval = TRUE, message = TRUE------------------- +# Review of the result table +results <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "results.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + results %>% + head(5), + format = "html", digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") + +## ----view chemicals, echo = TRUE, eval = TRUE, message = TRUE----------------- +# Review of the substance_table +substances <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "validation", + "chemicals.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + substances %>% + head(5), + format = "html", digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") + +## ----view references, echo = TRUE, eval = TRUE, message = TRUE---------------- +# Review of the substance_table +references <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "validation", + "references.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + references %>% + head(5), + format = "html", digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") + +## ----view species, echo = TRUE, eval = TRUE, message = 
TRUE------------------- +# Review of the substance_table +species <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "validation", + "species.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + species %>% + head(5), + format = "html", digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") + +## ----initialize databases, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE---- +# project <- REcoTox::create_project(database_path = database_folder, +# project_path, +# initalise_database_project = TRUE, +# initalise_project = TRUE, +# load_default = FALSE) + +## ----initialize project, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE---- +# project <- REcoTox::prepare_data(project = project, +# load_initial_project = FALSE, +# new_project_path = NA, +# save_project = TRUE +# ) + +## ----view pivot tables, echo = FALSE, eval = TRUE, message = TRUE------------- +# Review of the privot table +pivot <- + project$object$results_pivot + +kable( + pivot %>% + head(5), + format = "html", digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") + +## ----sessioninfo, echo = TRUE, eval = TRUE, message = FALSE------------------- +sessionInfo() + +## ----clean_up, echo = FALSE, results = "asis", eval = FALSE------------------- +# unlink(project_folder, recursive = TRUE) + diff --git a/vignettes/REcoTox.Rmd b/vignettes/REcoTox.Rmd index b321fe3..9f60d6d 100644 --- a/vignettes/REcoTox.Rmd +++ b/vignettes/REcoTox.Rmd @@ -1,1613 +1,640 @@ ---- -title: "MZquant - high-throughput target screening (v`r desc::desc_get_version()`)" -shorttitle: "MZquant (version `r desc::desc_get_version()`)" -author: -- name: Tobias Schulze - affiliation: Helmholtz Centre for Environmental Research - UFZ, Leipzig, Germany - email: tobias.schulze@ufz.de -- name: Eric Carmona Martínez - affiliation: Helmholtz Centre 
for Environmental Research - UFZ, Leipzig, Germany -date: 2021-10-27 -output: - BiocStyle::html_document: - toc_float: true -includes: - in_header: mzquant.bioschemas.html -vignette: > - %\VignetteIndexEntry{MZquant - high-throughput target screening} - %\VignetteKeywords{Mass Spectrometry, Analytical Chemistry, quantification} - %\VignetteDepends{BiocStyle, desc} - %\VignettePackage{MZquant} - %\VignetteEncoding{UTF-8} - %\VignetteEngine{knitr::rmarkdown} -bibliography: references.bib -csl: biomed-central.csl -editor_options: - markdown: - wrap: 200 ---- - -```{r biocstyle, echo = FALSE, messages = FALSE, results = "hide"} -BiocStyle::markdown() -``` - -```{r init, message = FALSE, echo = FALSE, results = "hide" } -## Silently loading all packages -library(BiocStyle) -library(desc) -library(dplyr) -library(kableExtra) -``` - -# Introduction - -`MZquant` is a comprehensive, semi-automated computational mass -spectrometry workflow for the high-throughput target screening. -`MZquant` builds on the compelling mass spectrometry suite -[`MZmine3`](https://mzmine.github.io "MZmine 3") [@MZmine2; -@Pluskal_Castillo_Villar-Briones_Oresic_2010]. `MZmine 3` is required -for preprocessing of the raw mass spectrometrical data. However, it -includes no advanced module for the targeted screening. - -`MZquant` processes the `MZmine 3` export files in `csv` format in the -following main steps: - -- Feature list workflow - - - preprocessing of the `MZmine 3` output file including selection - of custom-defined columns to remove unused columns from the full - `MZmine 3` output. 
- - - interactive editing of the processed `csv` to justify column - names or else (optional) - - - preparation of internal feature lists (e.g., samples, blanks, - quantification, quality control) - - - assignment of a standardized name for all compounds to prevent - name confusions during processing - - - interactive removal of duplicate annotations and review of - missing compounds - -- Blank workflow - - - annotation of blank features - - - removal of blank features by different computational approaches - (optional) - -- Quantification workflow - - - automated, interactive assignment of quantification levels - - - annotation of all features with the nearest neighbor internal - standard or a preselected internal standard, if defined in the - `substance_table` - - - normalization of all features to the assigned internal standard - - - modelling and trimming of the quantification models - - - fitting of quantification models - - - interactive trimming of compounds with failed automated trimming - - - automated quantification of the samples and export to a `csv` - file - -`MZquant` processes the `MZmine 3` export files in `csv` format. It expects an -aligned feature list with full annotation of the features of interest. - -**Note:** `MZquant` expects the use of spiked (isotopically labeled) internal -standards because of quality assurance. Samples, calibrations, and quality -standards without spiked internal standards cannot be processed. In best case, -the mix of standards spans the full chromatographic range. Spiked blanks -should be treated as samples, because the features related to internal -standards might be removedby blank correction. - -**Note for users of `MZquant` versions <0.8.0**: The change to the new -`MZmine 3` output format required a refactorisation of the internal -data model in `MZquant`. Unfortunately, old projects cannot be reprocessed. 
- -# Input files - -`MZquant` requires a comma separated file (`csv`) in the `MZmine 3` -format ([Figure 1](#fig1)). The standardized delimiter of `csv` is a -`comma`. Other delimiters are not supported. If in-field separation is -required, for example in comment field, the use of the pipe `|` symbol -is recommended or semicolon `;` ([Figure 2](#fig2)). However, it is -strongly recommended to use the pipe `|`. - - -```{r} -#| fig1, -#| fig.cap = "Export feature lists in MZmine 3.", -#| fig.align = "center", out.width = "75%", -#| echo = FALSE, eval = TRUE -knitr::include_graphics("./figures/Figure_1.png") -``` - -```{r} -#| fig2, -#| fig.cap = "Export options in MZmine 3.", -#| fig.align = "center", out.width = "75%", -#| echo = FALSE, eval = TRUE -knitr::include_graphics("./figures/Figure_2.png") -``` - -**Notes** - -- It is recommended to use the identical `substance_table` for the annotation - in `MZmine 3` and `MZquant` for consistent results. - -- Do not change the `MZmine 3` output file manually to avoid a - corruption. Run `feature_list_workflow(step = 1)` to clean the file - first before preview and any editing. - -## New field schema in `MZmine 3` output format - -### Field groups in `MZmine 3` output format - -In the `MZmine 3` output format, related columns are tagged with a group -prefix: - -- `manual_annotation:` tags related to manual annotations in - `MZmine 3` - -- `compound_db_identity:` tags related to automated annotations using - a custom database - -- `datafile:` columns containing sample data - -- other columns like `id`, `mz`, or `rt` - -The overall number of columns exported is immense. Do not edit the -original output, but the refined file after running -`feature_list_workflow(step = 1)`. - -### Response format - -The response (i.e. `area` and `intensity`) is tagged by a suffix of the -`datafiles`, for example: - -`datafile:file_name.mzML:area` or `datafile:file_name.mzML:intensity` - -`MZmine 3` exports both, `area` and `intensity`. 
In the `MZquant_settings.yaml` -of `MZquant`, it is possible to select the preferred method for -quantification. The default is `area`. - -## Name conventions for smart processing in `MZquant` - -### Calibrations - -The calibration columns need to be tagged by a name like "Cal", "Calib" -or "Calibration" followed by the concentration level of the standard. -The exact tag needs to be specified in the `MZquant_settings.yaml` (details see -below). - -*A good name of a column representing a calibration sample looks like:* - -`datafile:190303_09_Cal_100_ngL_2018.mzML:area` - -In case, the calibration levels contain not a decimal (i.e., 0.5ng L), -but it is as "p" (i.e. 0p5), replace the "p" by a "." and add an -underline between the value and the unit. - -*For example:* - -`datafile:190303_09_Cal_0.5_ngL_2018.mzML:area` - -*Should be replaced by:* - -`datafile:190303_09_Cal_0p5ngL_2018.mzML:area` - -With a correct tagging of the calibration levels, it is possible to -generate the calibration-to-levels table automatically. - -### Blanks - -Blanks should be tagged with the name `Blank` or `blank`. The exact tag -needs to be specified in the settings (see below). It is recommended to -remove the first blank of a sequence because it is very noisy due to -carry-over of previous measurements. - -*Examples for correct tagging:* - -`datafile:190303_15_Blank95.5.mzML:area` - -`datafile:190303_16_Trip_Blank.mzML:area` - -### Quality Controls (QC) - -Usually at the end of every batch, you can find some calibration samples -for quality control to check if the chromatography and acquisition was -stable during the batch run. The name pattern of these samples is -usually similar to the name pattern of the calibration. To use those -samples for quality control, replace `Calib` or `Cal` by `QC`. If not, -these samples will be handled as additional calibration points. 
- -### Internal Standards (IS) - -In general, it is necessary to spike a mix of internal standards (IS) in -your samples for quality assurance (for example to correct for matrix -effects). - -**Note:** Without IS spiked into your samples, you cannot use MZquant. - -The IS are allocated at the end of the `substance_table`. In addition to -the main IS, the list includes the isotopes and adducts of many IS. The -regular name of an IS ends with `M+H` or `M-H` The suspect internal -standards can end by `_13C`, `_13S`, `_34S`, `_37Cl` among others. - -The names of internal standards must be tagged with an unequivocal -prefix, for example `IS_compound_name`. The tag must be defined in the -`MZquant_settings.yaml`. - -Check the raw data for missing IS. Almost all IS should be found in the -samples, if not, redo the `MZmine 3` analysis with refined parameters. - -If an IS cannot be found or contains more than 50% of missing values, -open the `MZquant_substances.csv` and re-classify the IS entry as a `Suspect` -(column `Class`) and set the quantification tag to `auto`. - -You may consider using an isotope or adduct as an alternative? No -problem. In this case, change the column `Class` to `Internal Standard` -and set the quantification tag to `IS`. - -However, missing values in the IS data can be handled automatically by -`MZquant`. Do not impute any value. The only appropriate imputation -method is K nearest neighbors (KNN), which is used by `MZquant`. - -Other values such as the mean or median do bias the data. But in the -case of more than 50% of missing values, the KNN algorithm imputes the -mean values, because KNN cannot handle too many missing values. To -customize, this setting can be changed in the `MZquant_settings.yaml`. - -In the latter case, you may consider removing the IS or replacing it by -the isotope or adduct anyway. The re-classification of IS is also -possible on a later stage in the `MZquant` feature list workflow. 
Hence, -no action is required at this stage. - -If an IS was not found in any sample, calibration or blank, it will be -separated from the dataset such as missing compounds. - -### Important Note - -**Never** delete rows from your feature list or the substance file. This -is a major cause of failures. Missing compounds and internal standards -in the feature list will be removed from analysis automatically. - -Furthermore, `MZquant` provides procedures to remove or rename -annotations, and to reclassify internal standards. `MZquant` cares about -the changes in all tables including the `substance_table`. - -Experienced users can edit the annotations in `MZmine` or using the -refined `MZmine` output file in an external spreadsheet application -(preferable LibreOffice [@LibreOffice]). - -# Using MZquant - -The following tutorial explains the different steps of `MZquant` in a -comprehensive demonstration. `MZquant` includes different interactive -steps, which require the evaluation of comma separated text files -(`*.csv`) in an external spreadsheet application (preferable LibreOffice -[@LibreOffice]). - -## Load the MZquant package - -```{r load MZquant package, eval = TRUE, echo = TRUE, message = FALSE, warning = FALSE} -# Load the MZquant package -library(MZquant) -``` - -## Documentation for MZquant - -A detailed description of all functions of `MZquant` is available in the -`R Documentation`. - -```{r R Documentation, echo = TRUE, eval = FALSE} -# Documentation of MZquant -help(package = "MZquant") -``` - -## Preparation of the working environment (for beginners) - -The processing of annotated feature lists in MZquant is semi-automated -controlled by a processing script `MZquant_processing_script.R` and a -settings file `MZquant_settings.yaml`. - -If you run MZquant for the first time, a tutorial project is available -to demonstrate all important steps of MZquant processing. 
The following -script is preparing an example folder in your home directory and copies -all necessary files in the folder. - -```{r initialize project folder, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE} -# Path of the project folder -project_folder <- "your_folder" -settings_file <- "MZquant_settings.yaml" - -# The project folder is created in the home directory -project_folder <- normalizePath(ifelse(.Platform$OS.type == "unix", - paste0("~/", project_folder), - paste0( - Sys.getenv("HOMEPATH"), - "\\", - project_folder - ) -)) - -# An existing folder is deleted -if (dir.exists(project_folder)) { - unlink(project_folder, recursive = TRUE) -} -``` - -This command initializes the project folder and copies the demonstration -project in the project folder. - -```{r initialize project, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE} -# Initialization and setup of the project folder. -# default_settings = TRUE loads the default settings file -# default_example = TRUE loads the example files -initialise_project(project_folder, settings_file, - default_processing_script = TRUE, - default_settings = TRUE, - default_example = TRUE -) -``` - -The `project_folder` contains the following files and directories: - -```{r list project folder} -# List files and directories in project_folder -list.files(project_folder, recursive = TRUE, include.dirs = TRUE) -``` - -`MZquant_processing_script.R` is the processing script of the MZquant -workflow, `MZquant_settings.yaml` is the settings file, -`MZquant_samples.csv` contains the results exported of MZmine, and -`MZquant_substances.csv` contains the chemical information on the -targets used for annotation. - -## Review of the input data - -To review the input data, let us look in the data in a short view. For a -detailed review, use the next chunk. 
- -```{r view samples, echo = TRUE, eval = TRUE} -# Review of the samples table -samples <- readr::read_csv(file = normalizePath(path = file.path( - project_folder, - "MZquant_samples.csv" -), ), show_col_types = FALSE) - -kable( - samples %>% - head(5) %>% - dplyr::select( - id, mz, rt, `compound_db_identity:compound_db_identity`, - `datafile:220101_18_ESIpos_Calib_std_1000_ngL.mzML:area` - ), - format = "html", digits = 2 -) -``` - -```{r view substances, echo = TRUE, eval = TRUE} -# Review of the substance_table -substances <- - readr::read_csv( - file = normalizePath( - path = file.path( - project_folder, - "MZquant_substances.csv" - ), - ), - show_col_types = FALSE - ) - -kable( - substances %>% - head(5) %>% - dplyr::select( - id, mz, rt, compound_id, `mzquant:compound_class`, - `mzquant:mode` - ), - format = "html", digits = 2 -) -``` - -To get a detailed overview of the input data, run the following chunk. - -```{r view input data, echo = TRUE, eval = FALSE} -# Review input data in detail -samples <- readr::read_csv( - file.path( - project_folder, - "MZquant_samples.csv" - ), - show_col_types = FALSE -) -View(samples) - -substances <- readr::read_csv( - file.path( - project_folder, - "MZquant_substances.csv" - ), - show_col_types = FALSE -) -View(substances) -``` - -## Preparation of the working environment (for experienced users) - -Experienced users can re-use existing `MZquant_processing_script.R` and -`MZquant_settings.yaml` files.^[Note that the workflow and the settings file can change in a new - package version and thus a review of the change log (NEWS) is recommended.] - -The steps to setup a custom project are: - -1. Create a project folder.^[It is recommended to use the same name as the analytical batch.] - -2. Copy the custom `MZquant_substances.csv` and the `MZquant_samples.csv` in the folder. - -3. Re-use existing `MZquant_processing_script.R` and/or `MZquant_settings.yaml` - files. - -4. Or, create new files with the following command. 
 - -```{r initialize new processing files, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE} - -# Initialization and setup of the project folder -project_folder <- "your_folder" -settings_file <- "MZquant_settings.yaml" - -# default_settings = TRUE loads the default settings file -# default_processing_script = TRUE loads the default processing script -initialise_project(project_folder, settings_file, - default_processing_script = TRUE, - default_settings = TRUE, - default_example = FALSE -) -``` - -The command will initialize the project folder and copy the default -`MZquant_settings.yaml` and the default `MZquant_processing_script.R` in -your project folder. If you copied existing files in step 3 with similar -names, they will be overwritten by step 4. - -## MZquant processing and settings - -Open the `MZquant_processing_script.R` and the settings file -`MZquant_settings.yaml` for review and further processing. - -```{r load processing and settings file, echo = TRUE, eval = FALSE} - -# Open the processing script -file.edit(file.path(project_folder, "MZquant_processing_script.R")) - -# Open the settings file -file.edit(file.path(project_folder, settings_file)) -``` - -### The processing script - -The `MZquant_processing_script.R` contains all necessary steps of the -MZquant workflow described in this vignette. - -The data of the `MZquant` project is stored in a hidden environment -`.MZquant.env`. - -To review the hidden environment, you may call: - -```{r call enviroment, eval = FALSE, echo = TRUE} -View(.MZquant.env) -``` - -The `.MZquant.env` is stored after each processing step to ensure easy -redo of single processing steps without need to repeat the whole -analysis. - -### The settings file - -The settings file `MZquant_settings.yaml` contains all necessary and -customization settings. Scroll through the document and make all -required edits, for example: - -- How are the blanks tagged (e.g., `Blank`)? -- How are the calibration columns tagged? 
-- How are the quality control columns tagged? -- Should a blank correction be performed? -- Which type of blank correction is performed? - -Finally, save the file and go back to the processing script -`MZquant_processing_script.R`. - -The settings file is read in the current `.MZquant.env` environment by -running: - -```{r read in settings, eval = FALSE, echo = TRUE} -# Call the read_settings function to read the filed settings -# into the current `.MZquant.env` -read_settings(settings_file, project_folder) -``` - -The `read_settings()` function is especially helpful if settings are -changed during analysis or old projects are reprocessed with a newer -version of `MZquant`. In the latter case, new features or bug fixes can -be easily applied without need of the tedious reprocessing of the entire -data. - -## Feature list workflow - -The `feature_list_workflow` reads the `MZquant_samples.csv` and the -`MZquant_substances.csv`. It prepares the feature tables for the -analysis in MZquant. - -### Feature list workflow step 1 - -Step 1: Read `MZmine 3` exported feature list `MZquant_samples.csv` and -the substance file `MZquant_substances.csv`. - -```{r} -#| feature list workflow step 1, echo = TRUE, -#| message = FALSE, warning = FALSE, eval = TRUE - -# Feature list workflow step 1 -feature_list_workflow(step = 1) -``` - -- Read the `MZmine 3` exported feature list and the `substance_table`. - -- Merge the tables. - -- Assign standard names (`compound_db_identity:stdname`) to the - annotations in the `feature list` and the `substance_table`. - -- Assign the response method set in `MZquant_settings.yaml:data:response`. - -- Select mandatory and custom fields set in - `MZquant_settings.yaml:data:meta_mzmine`. - -- Export the new table to `MZquant_samples_refined.csv` in the - `project_folder`. 
- -The `feature_liste_workflow` step 1 applies a `regular expression` to -the `compound_identity` in the `substance_table`, respectively, to the -column `compound_db_identity:compound_db_identity` in the -`MZquant_samples.csv` to replace any punctuation in the names to ensure -unequivocal standardized names called `stdname`. The `stdname` is one of -the `keys` used in `MZquant`. - -The regular expression is: - -$$stdname = stringi::stri\_replace\_all\_regex(name, "[[:punct:] \backslash \backslash s]+", "\_")$$ - -For example, `3,3'-Dichlorobenzidine` is expressed as -`3_3_Dichlorobenzidine`, which is unequivocally processing. - -In the `samples_refined.csv`, the `stdname` is stored in -`compound_db_identity:compound_db_identity:stdname.` - -Review this table. For experienced users: Take the opportunity to edit -the table to your purposes, for example, add missing QC data by copying -of calibration data. - -*Optional: Load the environment after running `quantification_workflow(step = 1)`* - -The working environment is stored after each processing step in the root -of the project folder. To reload the environment, run the `load` -command. This is for example helpfully, if a processing required to -break for a while or a `bug` in a later step was tracked down. - -```{r load feature_list_workflow_1.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `feature_list_workflow(step = 1) -load( - file = file.path(project_folder, "MZquant_feature_list_workflow_1.RData"), - envir = .GlobalEnv -) -``` - -### Feature list workflow step 2 - -Step 2: Read and preprocess the refined feature list `MZquant_samples_refined.csv`. - -```{r feature list workflow step 2, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE} -# Feature list workflow step 2 -feature_list_workflow(step = 2) -``` - -- Read the MZmine export.csv file and the `substance_table`. - -- Prepare the internal feature list structure. 
- -- Match the annotations in the feature list with the - `substance_table`. - -- Filter missing substances. - -The columns in `MZquant_settings.yaml:data:meta_mzmine` and -`MZquant_settings.yaml:data:meta_substances` are appended. The -`feature_list_workflow` step 2 stores three files in the -`results/feature_lists` folder. - -```{r list feature_list folder, echo = TRUE} -# List files and directories in the feature list folder -list.files(file.path(project_folder, "results", "feature_lists"), - include.dirs = TRUE -) -``` - -The `target_table_for_review.csv` is the relevant spreadsheet for all -edits in the annotated feature list. Open the list in your favorite -spreadsheet application. To preserve the edits for later review, the -file could be saved in the native spreadsheet format (e.g. `ods` or -`xlsx`). - -The only columns to be edited (do not edit any other column or delete -any row or entry, this will have side effects and you need to repeat -this step): - -- `remove_annotation` -\> set to `1` for removing annotations - -- `newstdname` -\> add a new `stdname` (from - `substance_table_for_review.csv`), if necessary (e.g., in case of - *duplicate masses*) - -- `new_class` -\> class of the annotation (for example of an internal - standard), `c(Target, Suspect, Internal Standard)` - -See the green highlighted columns in [Figure 3](#fig3). - -#### Tips for the data review - - -**Hide columns, use auto filters and freeze rows/columns ([Figure 3](#fig3))** - -- Hide all columns you do not need for review (e.g., `stdname`, - `compound_id`, `mzquant_id`). - -- Mark the first row and add the auto filter (data \| AutoFilter). - -- Use the Freeze Rows and Columns functions to freeze the header and - the column with the `StdName`. - -- Columns could be re-arranged, but never delete columns or remove - rows! - -**Hide columns, use auto filters and freeze rows/columns** - -- Use color scales to mark the the blank data the sample data ([Figure - 4](#fig4)). 
- -- Use the scientific notation for theses columns for straightforward - comparison. - -- Add `duplicate` conditional formatting to column `stdname` to - highlight duplicate names ([Figure 5](#fig5)) - - - -```{r} -#| fig3, fig.cap = "Hide and freeze columns.", -#| fig.align = "center", echo = FALSE, out.width = "80%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_3.png") -``` - - -```{r} -#| fig4, fig.cap = "Highlight the data with color scales.", -#| fig.align = "center", echo = FALSE, out.width = "100%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_4.png") -``` - - -```{r} -#| fig5, -#| fig.cap = "Use conditional formatting to annotate duplicates in stdname and remove\\_annotation.", -#| fig.align = "center", -#| echo = FALSE, out.width = "75%", eval = TRUE -knitr::include_graphics("./figures/Figure_5.png") -``` - -**Data review** - -The selection of the correct annotation is not always -straightforward. `MZquant` delivers different qualifiers to support the -decision ([Figure 6](#fig6)). - -- Decision criteria, review the following columns: - - - expected `mz` from `MZquant_substances.csv` - - - expected `rt` from `MZquant_substances.csv` - - - detected `row_mz` in MZmine 3 - - - detected `row_rt` in MZmine 3 - - - `annotation_score` estimated in MZmine 3 - - - `row_mz_delta_ppm` estimated in MZmine 3 - - - `deltart` as the difference between expected and detected - retention time - - - `deltamz` as the difference between expected and detected mz - - - `mz_ppm_error` in ppm estimated in MZquant - - - `quantification_monotonicity`` to show monotonicity of the - quantification levels - - - `counts` in blanks, calibrations, samples, and quality controls - - - `blank_features_threshold`^[The `blank_features_threshold` is the value which is calculated by - the blank correction and is the level of blank noise. All features - with lower intensities than the threshold will be removed in the - blank_workflow.] 
- - - maximum value in the samples - - -```{r} -#| fig6, fig.cap = "Criteria for the removal of duplicate annotations.", -#| fig.align = "center", echo = FALSE, out.width = "90%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_6.png") -``` - -- Lookup for compounds with low findings in the samples, e.g.: - - - select for example those records with low sample annotations - - - lookup for the better annotation score - - - check if the compound occurs only in the first consecutive - sample following the highest calibration level - - - check for their intensity, if they occur only at the noise level - (1e4. 1e3, compare to blanks), they could be carry over from the - calibration sample - - - check if they only occur in QC samples and so on - - - remove the annotation if the compound is carry over, not - occurring in samples etc. by setting the value in - `remove_annotation` to `1` - - - **Note**: because of the blank correction, some compound without - values in the samples will occur in the trimming table - -- Lookup for duplicates - - - Duplicates should be removed from the table; they occur from - broad peaks, isobaric compounds, retention time shifts, etc. - - - Check the comment field and lookup for existing entries, for - example ([Figure 5](#fig5)): - - - duplicate mass -\> isobaric compounds - - - check the `missed_substances_for_review.csv` and search - for the `unit mass` - - - compare the retention time - - - copy the `StdName` from the missed table to the field - `newStdName` to rename the tagging of the compound - -In addition a screening of the shapes of the peaks in `MZmine 3` is also -helpful, does keep `MZmine 3` always open until the end of your analysis -([Figure 7](#fig7)). However, the decision is also not always -straightforward, if shapes fit well in all cases ([Figure 8](#fig8)). 
 - -It is not always an easy decision: the `deltart` / `deltamz` and the -`mz_ppm_error` might be good for some records, but the ratios -`QC`/`calibration` do not fit well → in cases of concern ask your -supervisors or discuss with colleagues. - -Mark all duplicates to be removed by setting the value in -`remove_annotation` to `1` ([Figure 5](#fig5)) or re-annotate them by -setting a new name in `new_StdName`. Look up other curious compounds -and remove them if you like. - -**Note: If you change to scientific notation, change back to `General` -(`LibreOffice`) or `Standard` (`Excel`) formats to avoid loss of -precision!** In recent versions of `Excel`, it is possible to save data -to `CSV UTF-8 (Comma delimited) (*.csv)`. This is recommended. - - -```{r} -#| fig7, -#| fig.cap = "Peak shapes of duplicate annotations in MZmine 3. An example with straightforward decision.", -#| fig.align = "center", echo = FALSE, out.width = "100%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_7.png") -``` - -```{r} -#| fig8, -#| fig.cap = "Peak shapes of duplicate annotations in MZmine 3. An example, that needs more decision criteria.", -#| fig.align = "center", echo = FALSE, out.width = "90%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_8.png") -``` - -*Optional: Load the environment after running `feature_list_workflow(step = 2)`* - -```{r load feature_list_workflow_2.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `feature_list_workflow(step = 2) -load( - file = file.path(project_folder, "MZquant_feature_list_workflow_2.RData"), - envir = .GlobalEnv -) -``` - -To override the revision step in the table, an edited table can be -loaded in the environment. 
- -```{r read the demonstration target table, echo = TRUE, eval = TRUE, message = FALSE} -# Load the edited `target_table_for_review.csv` -copy_demo_target_table(project_folder = project_folder) -``` - -### Feature list workflow step 3 - -After successful edit of `target_table_for_review.csv`, the table is -loaded and processed in `feature_list_workflow` step 3. This step -updates the `feature list` and prepares the final `feature list` for -further processing. - -```{r feature list workflow step 3, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE} -# feature list workflow step 3 -feature_list_workflow(step = 3) -``` - -In the case, duplicates are still not annotated, `MZquant` will warn -([Figure 9](#fig9)). In this case, review the -`target_table_for_review.csv` and repeat -`feature_list_workflow(step = 2)`.^[This feature can be also used to quickly check for duplicates.] - - -```{r} -#| fig9, fig.cap = "Remaining duplicate mass compounds.", -#| fig.align = "center", echo = FALSE, out.width = "75%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_9.png") -``` - -*Optional: Load the environment after running `feature_list_workflow(step = 3)`* - -```{r load feature_list_workflow_3.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `feature_list_workflow(step = 3) -load( - file = file.path(project_folder, "MZquant_feature_list_workflow_3.RData"), - envir = .GlobalEnv -) -``` - -The `feature list workflow` is finalized. The workflow generates several -files in the *feature_list* folder for interest users. - -```{r list feature_list folder 2, echo = TRUE, eval = TRUE} -# List files and directories in the feature_list folder -list.files(file.path(project_folder, "results", "feature_lists"), - include.dirs = TRUE -) -``` - -## Blank workflow - -The blank workflow runs the `blank tagging`, if required. If the blank -correction is not necessary, set blank_correction: FALSE in -MZquant_settings.yaml. 
For details see also the linked functions. - -```{r blank workflow, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE} -# Blank workflow -blank_workflow() -``` - -`MZquant` integrates two different approaches for blank correction. In -general, a `blank threshold` is calculated of the `blank` values and all -features below this `blank threshold` are deleted of the `feature_list`. - -The first option calculates a simple `blank threshold` based on the -`mean` and the `standard deviation`: - -$$blankthreshold = mean + blankfactor * sd$$ Where `blankfactor` is a -`numerical` factor to control the fold-change of the -`standard deviation` `sd`. - -The second function calculates the `Student t` distribution based -`blank threshold` for each feature based on the -`method detection limit (MDL)` estimation method of US EPA -(@usepa49FR434302011). The method uses a distribution controlled factor -for the addition of the `standard deviation` (`sd`) to the `mean` value: - -Case 1: `n >= minimum number of valid blank values` - -$$blank threshold = mean + qt(p, df = n - 1) * sd$$ - -Where `mean` is the `average` of the blank feature values, `qt` is the -`Students t` density function, `p` is the probability, `df` are the -`degrees of freedom`, `n` is the number of blank values and `sd` is the -`standard deviation`. - -Case 2: `n < minimum number of valid blank values` - -$$blankthreshold = mean + qt(p, df = 1) * sd$$ - -The parameters `p` (`p` = `alpha`) and `n` (`n` = `blank_qt_threshold`) -are set in the *settings file*. 
- -*Optional: Load the environment after running \`blank_workflow()* - -```{r load blank_workflow.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `blank_workflow()` -load( - file = file.path(project_folder, "MZquant_blank_workflow.RData"), - envir = .GlobalEnv -) -``` - -## Quantification workflow - -The quantification workflow creates the quantification models, enables -trimming of the quantification models and finally performs the -quantification of the targeted compounds. - -### Quantification workflow step 1 - -The `quantification_workflow` step 1 creates the *calibration file -table* `calibration_levels.csv` in ./`results/quantification` for review -and edition of the `calibration levels` and column mapping. The workflow -links the columns with the calibration data with the related -concentration levels. - -```{r quantification workflow 1, echo = TRUE, eval = TRUE, message = FALSE} -# quantification workflow step 1 -quantification_workflow(steps = 1) -``` - -The tag for the calibration columns is obtained of `MZquant_settings.yaml` from -section `data:standard`. If the tag `data:standard_level_pattern` has -the correct syntax, the calibration levels are filled automatically. - -**For example:** - -**Calibration column name** - -`220101_17_ESIpos_Calib_std_500_ngL` - -**Standard level pattern** - -`data:standard_level_pattern: "Calib_std"` - -The calibration level must be separated by the following patterns by an -underline (`_`). - -**NOTE** - -`MZquant` cannot handle duplicate level annotations in the current -version. In case of several calibration files for one calibration level, -the levels need to be corrected and distinguished by adding a small -decimal to the concentration level, for example `0.001, 0.002, etc.`. 
- - -| filename | concentration | -|:------------------------------------|--------------------------------:| -| 220101_12_ESIpos_Calib_std_10_ngL | 10 | -| 220101_15_ESIpos_Calib_std_100_ngL | 100 | -| 220101_18_ESIpos_Calib_std_1000_ngL | 1000 | -| 220101_xx_ESIpos_Calib_std_10_ngL | [10.001]{style="color: red;"} | -| 220101_xx_ESIpos_Calib_std_100_ngL | [100.001]{style="color: red;"} | -| 220101_xx_ESIpos_Calib_std_1000_ngL | [1000.001]{style="color: red;"} | - - - - - - - - - - - -*Optional: Load the environment after running `quantification_workflow(step = 1)`* - -```{r load MZquant_quantification_workflow_1.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `quantification_workflow(step = 1)` -load( - file = file.path(project_folder, "MZquant_quantification_workflow_1.RData"), - envir = .GlobalEnv -) -``` - -### Quantification workflow step 2 - -To load the (edited) `calibration_levels.csv` in the environment run the -`quantification_workflow` step 2. In this step, the -`internal standards (IS)` are assigned to each substance feature and the -relative feature heights (or areas) are calculated by normalization to -the related `IS`. The workflow assigns either the nearest IS (mode -`auto` in the *substance file* column `is_used`) or the predefined IS in -`is_used`. - -```{r quantification workflow 2, echo = TRUE, eval = TRUE, message = FALSE} -# quantification workflow -quantification_workflow(steps = 2) -``` - -The assigned internal standards and the IS calibrated data can be -reviewed in the following files. 
 - -```{r list quantification folder 1, echo = TRUE, eval = TRUE} -# List files and directories in the quantification folder -list.files( - file.path(project_folder, "results", "quantification"), - pattern = "_IS_", include.dirs = TRUE -) -``` - -*Optional: Load the environment after running `quantification_workflow(steps = 2)`* - -```{r load MZquant_quantification_workflow_2.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `quantification_workflow(steps = 2)` -load( - file = file.path(project_folder, "MZquant_quantification_workflow_2.RData"), - envir = .GlobalEnv -) -``` - -*Optional: Review assigned internal standards, reassign internal standards* - -If you are unhappy with the assignments, open the -`substance_Table_IS_assignment_for_review.csv` and add the standardized -name of the internal standard you like to use in the column `IS_used` -and save the file. - -Then run `quantification_workflow(steps = 2)` again to apply the -changes to your dataset. - -```{r quantification workflow 2 redo, echo = TRUE, eval = FALSE, message = FALSE} -# quantification workflow -quantification_workflow(steps = 2, redo = TRUE) -``` - -*Optional: Load the environment after running the IS reassignment* - -```{r load MZquant_quantification_workflow_redo_2.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `quantification_workflow(steps = 2, redo = TRUE)` -load( - file = file.path( - project_folder, - "MZquant_quantification_workflow_redo_2.RData" - ), - envir = .GlobalEnv -) -``` - -### Quantification workflow steps 3-5 - -The `quantification_workflow` steps 3-5 generate the calibration -models, trim the models, plot the models, and finally quantify the -features: - -- step 3: generates generalized additive models (GAM) based on the raw - data, exports the feature table and plots the draft models. 
- -- step 4: magic trimming of the quantification models, applying - automated trimming, and plotting. - -- step 5: applies manual trimming in the trimming table, generates - trimmed models and plots them, and finally estimates the - concentrations in unknown samples, quality controls, and - quantification data and exports the final table. - -It is recommended to run the full steps 3-4 to get the results of -automated trimming. - -```{r quantification workflow 3, echo = TRUE, eval = TRUE, message = FALSE} -# quantification workflow -quantification_workflow(steps = 3) -``` - -```{r quantification workflow 4, echo = TRUE, eval = TRUE, message = FALSE} -# quantification workflow -quantification_workflow(steps = 4) -``` - -*Optional: Load the environments after the last steps* -If something went wrong, try to go back to the previous steps: - -```{r load MZquant_quantification_workflow_3.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `blank_workflow()` -load( - file = file.path(project_folder, "MZquant_quantification_workflow_3.RData"), - envir = .GlobalEnv -) -``` - -```{r load MZquant_quantification_workflow_4.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `blank_workflow()` -load( - file = file.path(project_folder, "MZquant_quantification_workflow_4.RData"), - envir = .GlobalEnv -) -``` - -The purpose of the automated trimming is to limit the calibration models -to a relevant range covering the range of concentrations occurring in -the samples. The idea is to fit the best calibration model possible to -enhance accuracy of the calibration. `MZquant` applies a set of rules to -trim the models (see below). The modeling is based on -`Generalized Additive Models` and includes an automated smoothness -selection (see R package `mgcv` for details). - -However, in many cases an automated trimming is not possible and thus -review and manual trimming of compound is required. 
The following three -files are required for the manual trimming (in folder -`results/quantification`). - -```{r list peaklist folder 4, echo = TRUE, eval = TRUE} -# List files and directories in the peaklist folder -list.files(file.path(project_folder, "results", "quantification"), - pattern = "trimmed", include.dirs = TRUE -) -``` - -The spreadsheet `quantification_table_trimmed_for_review.csv` is the -main table for reviewing and reprocessing the trimming of the -calibration models. The file -`quantification_table_non-trimmed_for_review.csv` contains the -non-trimmed concentration to relative intensity data. This table is very -useful to restore data, if the original relative intensities have been -manually deleted in the next steps. The pdf-file -`quantification_table_trimmed_model_figs.pdf` includes figs of all -calibration models. - -#### Editing the trimming table - -- Open the `quantification_table_trimmed_model_figs.pdf`. - -- Open the `quantification_table_trimmed_for_review.csv` in your - favorite spreadsheet application. - -- Set the `autofilter` to row 1 ([Figure 10](#fig10)). - -- Insert an empty row above row 1. - -- Hide the first column (`mzquant_id`). - -- Freeze the first column (`stdname`) and the first two rows. - -- Number the calibration columns ("C\_"-columns) beginning from the - columns with the lowest concentration level up to the highest - concentration level. The starting number is generally 3 representing - the position of the lowest concentration level in column C. - -- Select the calibration level columns and the consecutive columns - `in_sample_min` and `in_sample_max`, add a color gradient and set - the format to scientific notation for easier review. 
- -```{r} -#| fig10, fig.cap = "Prepare trimming table for review.", -#| fig.align = "center", echo = FALSE, out.width = "100%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_10.png") -``` - -It is also recommended to highlight the columns `manual_trim`^[This parameter -controls, if the compound is automatically trimmed(`0`) or not (`1`). -Set it to (`0`) after manual trimming to apply manual trimming. If set to `1`, -trimming is used at all.] and `tracefinder`^[This parameter just is a reminder -evaluate this parameter in a commercial software such as TraceFinder.] -with `1 = red` and `0 = green` as well as the the good (bad) `gam_r2` -([Figure 11](#fig1)). - -```{r} -#| fig11, fig.cap = "Highlight manual\\_trim, tracefinder, and gam\\_r2.", -#| fig.align = "center", echo = FALSE, out.width = "100%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_11.png") -``` -Check now all substances with `0` findings in the samples and set the -manual_trim and the drop_compound columns to `1` ([Figure 12](#fig12)). -The `0` findings could be selected by `NA` (and 0?) in column -`in_sample_num`. In general, the trim and drop columns should be filled -with `1` automatically. - -```{r} -#| fig12, fig.cap = "Select manual\\_trim and tracefinder columns.", -#| fig.align = "center", echo = FALSE, out.width = "90%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_12.png") -``` - -In the next step, select those rows with only 1 finding in the samples -and lookup for the column `min_sample_name` on the right end of the data -block. Select the first sample after the highest calibration level. This -sample often contains carry-over peaks at a very low level. Compare the -in sample minimal level with the lowest calibration levels and remove -those samples, if the sample level is among the low calibration levels -by setting `manual_trim` to `1` to skip trimming for this compound. - -The easy part of the trimming part is done, the more tedious starts now. 
- -*Trimming and rules* - -The trimming of the calibration is controlled by the settings in -`min_sample_neighbor_pos` and `max_sample_neighbor_pos` represent the -lower and upper margin of the trimmed calibration curve. Of course, the -limit cannot be below the lowest level and vice versa with the highest -level. - -In the best case, the calibration curve is fitted to the next -calibration level below the lowest relative intensity in the samples or -vice verse at the sample maximum. - -This automated trimming fails in the following cases (not exhaustive). - -Triggered by a rule: - -1. The trimmed range contains an empty field^[Missing values are handled - as `NA` internally in `MZquant`.] or zero value. - -2. The calibration data contains local minima and/or maxima. - -3. The calibration data is not monotonic increasing. - -4. The minimum sample intensity is below the lower calibration limit. - -5. The maximum sample intensity is above the upper calibration limit. - ---\> If you find bugs or cases, report them to Tobias Schulze -([tobias.schulze\@ufz.de](mailto:tobias.schulze@ufz.de){.email}), -please. - -*Manual trimming* - -In the mentioned cases (and maybe others), manual trimming of the -calibration data is required. - -Select all samples again by removing the sample name filter and the -sample number filter. In the `in_sample_num` filter, deselect the `NA` / -`0` rows to hide those rows. In addition deselect those rows which are -`1` in drop for a better overview. - -Lookup for the columns `min_sample_neighbor_pos`, -`max_sample_neighbor_pos`, `in_sample_min`, `in_sample_max`, -`calibration_monocity_trim`, `gam_r2`, `localmin_pos` and `localmax_pos` -and decide which rule is valid for the trimming. Check always the -minimal number of calibration points within your calibration margins. - -For manual trimming you must be set the lower calibration position or -the upper calibration position according to the sample margins ([Figure -13](#fig13)). 
Avoid to delete values outside the margins, this is a -possible cause of errors. The will be deleted in the next processing -step automatically. - -```{r} -#| fig13, -#| fig.cap = "Example of the trimmed range of the calibration levels.", -#| fig.width = 2, fig.align = "center", echo = FALSE, out.width = "90%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_13.png") -``` -Calibration levels within the upper and lower limit of the trimmed -calibration (`min_sample_neighbor_pos`, `max_sample_neighbor_pos`), -could be removed by deleting the value (for example local minima or -local maxima). - -If it is possible to trim the dataset manually, set `manual_trim` to `0` -to apply your settings. If not, you may consider to evaluate in -`TraceFinder` (or another application). Then set the `tracefinder` -column to `1`. You could also comment. The `tracefinder` and -`quantification_comment` is exported in the final output file. - -If something went wrong, open the -`MZquant_samples_non-trimmed_for_review.csv` and just copy the -respective `data block` back to the trimmed data file. - -It is recommended to save the file as an `ods` or `xlsx` file to keep -the formatting, and to export this spreadsheet as -`MZquant_samples_trimmed_for_review.csv`. - -**Important**: Depending on the settings in the spreadsheet application, -the export of the scientific notation might cause loss of decimals and -thus re-format the columns to decimals. In addition, delete the first -column to remove the auxiliary row with the column indexes ([Figure -10](#fig10)) before exporting to the csv file - -*An example in the demonstration data* - -- Lookup for the compound `0003_Phenazone`. - -- Set `min_sample_neighbor_pos` = `5` - -- Set `max_sample_neighbor_pos` = `8` - -- Save the `quantification_table_trimmed_for_review.csv` as described - above. - -To override the trimming in demonstration, an edited table can be loaded -in the environment. 
- -```{r read the demonstration trimmed table, echo = TRUE, eval = TRUE, message = FALSE} -# Load the edited `target_table_for_review.csv` -copy_demo_trimmed_table(project_folder = project_folder) -``` - -Finally, run the `quantification workflow step 5` to apply the manual -trimming and to quantify. - -```{r quantification workflow 5, echo = TRUE, eval = TRUE, message = FALSE} -# quantification workflow -quantification_workflow(steps = 5) -``` - -The last step can repeated, if you are not happy with the result to try -other manual trimming settings. - -# Appendix - -## Description of the `MZquant_settings.yaml` - -The current format was implemented in `MZquant` version 0.8.1. New -settings or changes will be tagged. - -### Current settings (0.8.1) - -The `MZquant_settings.yaml` is separated in six sections: - -- `settings`: `MZquant_settings.yaml` related parameters - -- `project`: project related parameters - -- `data`: data related parameters - -- `processing`: processing related parameters - -- `quantification`: quantification related parameters - -- `results`: settings to configure final output file with results - -### Section `settings` - -- `version`: The version of the default settings file (do not edit). - The `MZquant` version implementing the current `MZquant_settings.yaml` - format - -### Section `project` - -This section includes the general project settings. - -- `data_file`: Name of the data file containing the `MZmine 3` output. - -- `substances_file`: Name of the file containing the - `substance_table`. - -- `seed`: Integer as seed for random functions. - -- `save_all`: c(`FALSE`, `TRUE`), if `TRUE`, all generated tables are - exported as `csv` for review or debug (default: `FALSE`). - -### Section `data` - -This section contains the parameters describing the tagging of the -fields in the `aligned feature list`, exported by `MZmine 3`. 
A correct -parametrization is a prerequisite to assign the `samples`, `blanks`, -`quantifications`, and `quality controls` to the correct group and to -avoid biased results. - -- `standard`: Enter the term, which identifies the calibrations - columns (e.g. `Calib`). - -- `standard_level_pattern`: Enter the pattern before the - quantification level, required for automated assignment of - quantification levels (e.g. the correct pattern of - `Calib_water_2018_1000` is `Calib_water_2018`). - -- `quality_control`: Enter the term, defining the `quality control` - samples (e.g. `QC`). - -- `blank`: Enter the term, which identifies `blanks` (e.g. `Blank`). - -- `calibration_blank`: Enter the term, expressing the specific - `calibration blank`, which may contain `internal standards` biasing - the `blank threshold` estimation (e.g. `Calib_std_Blank`). - -- `response`: Enter the response estimation method (c("`area`", - "`intensity`"), default: `area`. - -- `IS_prefix`: Enter the prefix of the internal standard (e.g. - "`IS"`). - -- `meta_mzmine`: Metadata fields to be included from `MZmine 3` export - (default). An updated file will be written to the results folder. - Including the selected fields and feature information (area or - height). The values must be quoted, delimeted by comma, and - emphasized in square brackets, e.g.: ["id", "mz", "mz_range:min", - "mz_range:max", "rt", "rt_range:min", "rt_range:max", - compound_db_identity:compound_db_identity", - compound_db_identity:compound_annotation_score", - "compound_db_identity:mol_formula", - "compound_db_identity:precursor_mz", - "compound_db_identity:mz_diff_ppm", - "compound_db_identity:rt", - "manual_annotation:identity", - "manual_annotation:comment", - "manual_annotation:compound_name"]. - -- `meta_substances`: Metadata fields to be included from substance - file (default).. An updated file will be written to the results - folder. Including the selected fields and feature information (area - or height). 
The values must be quoted, delimited by comma, and - emphasized in square brackets, e.g.["compound", - "mzquant:compound_class", "comment", "adduct", "mzquant:mode", - "mzquant:prim_sec"]. - -### Section `processing` - -This section defines the automated `feature_list_workflow` and -`blank_workflow` processing. - -- `meta_substances_to_feature_list`: c(`TRUE`, `FALSE`), if `TRUE`, - the meta substances fields are added to revised MZmine 3 table for - review (default: `TRUE`). - -- `feature_class`: Enter the class which should be processed in blank - correction c("`tagged`","`all`"), be careful, `all` will last longer - time due to unprofessional implementation so far. - -- `blank_correction`: c(`TRUE`, `FALSE`), if `FALSE`, the features are - only tagged with blank thresholds, but not corrected (default: - `TRUE`) - -- `blank_correction_class`: Enter c("`samples`", "`all`") to select - the case of `blank_tagging`: - - - case 1: `samples` - only the features above the - `blank_threshold` are eliminated in samples. - - - case 2: `all` - features above the `blank_threshold` are - eliminated in calibrations and samples. - -- `blank_method`: Method for the blank threshold estimation with - c("`default`", "`qt`"): - - - `default`: `mean`(blanks) + `factor` \* `SD`(blanks) - - - `qt`: `mean`(blanks) + `qt`(`probability`, `degrees of freedom`) - \* SD(blanks)) - -- `blank_qt_alpha`: alpha or probability p for the qt estimation - (default: `0.99`). - -- `blank_qt_threshold`: Minimum number of valid blank features for the - use the `qt` estimation (default: `3`). - -- `blank_factor`: Multiplier of the `SD` for the default blank - correction method (default: `2`). - -### Section `quantification` - -This section contains the parameters for the `quantification_workflow`. - -- `min_cal_points`: Enter the minimum of calibration points required - (default: `4`). 
- -- `in_cal_neighbors`: Enter the minimum of calibration points of the - minimum sample on the lower end (default: `0`). - -- `max_cal_neighbors`: Enter the minimum of calibration points of the - maximum sample on the upper end (default: `0`) - -- `low_fig_scale_factor`: Enter the scale factor for fitting the low - level calibration range. It allows a better visual inspection of the - low calibration range fits, typical values are `0.1` or `0.05` - (default: `0.05`). - -- `IS_impute_method`: Enter the method for the `IS gap filling`. Use - with caution, it is more an experimental method and should be - mentioned, if used in a productive environment. The default method - `KNN` imputes missing internal standards by k-nearest neighbor - imputation. Other methods are considered as not reliable and biased. - -- `colmax`: Minimum of non missing data in columns passed to the - `impute.KNN::impute` function, expressed in decimals (default: - `0.8`). - -- `rowmax`: Minimum of non missing data in rows passed to the - `impute.KNN::impute` function, expressed in decimals (default: - `0.8`). - -- `IS_method`: Enter the method for the IS assignment for the - calculation of the concentration ratios with c("`SUBTAB`", - "`SAMPTAB`") (default: `SUBTAB`) - -- \`SUBTAB : The RTs of the substance_table are used to assign the - nearest internal standard. - -- `SAMPTAB`: The RTs of the samples table are used to assign the - nearest internal standard. - -- `n_localminmax`: Number of points which define the range of local - minima and maxima (default: `2`). - -- `plot_names`: Plot the names of the compounds during modelling - (helpful for debugging of failing compounds) with c(`FALSE`, `TRUE`) - (default: `FALSE`) - -### Section `results` - -This section includes the settings for the final results file. - -- `digits`: Enter number of digits for the output of decimals - (default: `6`). 
- -- `unit`: Enter the unit of the final data (default: `ng/L`) - -- `result_metadata`: - -- Selection of metadata columns to be includes in the final output. - Could be any from `MZmine 3` output, `substance_table`, - `target_table_for_review`, or `trimmed_for_review`. - -- Optional: The columns can be sorted in your preferred order. - -- **Warning**: The columns related to `MZmine 3` and the - `substance_table` must be included in the tags `data:meta_mzmine` - and `data:meta_substances`! - -- The values must be quoted, delimited by comma, and emphasized in - square brackets (default: ["compound_id", "compound", - "mzquant:compound_class", "row_mz", "row_rt", - "compound_db_identity:compound_annotation_score", - "compound_db_identity:precursor_mz", "row_mz_delta_ppm", - "manual_annotation:comment", "comment", "adduct", "mzquant:mode", - "mzquant:prim_sec", "mzquant:class", "gam_r2", "tracefinder", - "quantification_comment"]) - -- **Caution**: Some of the mzmine columns were replaced by names in - column 1. In these cases, the new name must be entered: - -- `mzquant_id` = "`id`" - -- `row_mz` = "`mz`" - -- `row_mz_min` = "`mz_range:min`" - -- `row_mz_max` = "`mz_range:max`" - -- `row_rt` = "`rt`" - -- `row_rt_min` = "`rt_range:min`" - -- `row_rt_max` = "`rt_range:max`" - -- `row_mz_delta_ppm` = "`compound_db_identity:mz_diff_ppm`" - -- `compound_id` = "`compound_db_identity:compound_db_identity`" - -- `result_classes`: Classes of sample groups quantified and passed to - the final output with c("`blanks`", "`qc`", "`quantification`", - "`samples`"). - -- The values must be quoted, delimited by comma, and emphasized in - square brackets (default: ["qc", "quantification", "samples"]). 
- -# SessionInfo -```{r sessioninfo, echo = TRUE, eval = TRUE, message = FALSE} -sessionInfo() -``` - -# References - -```{r clean_up, echo = FALSE, results = "asis", eval = FALSE} -unlink(project_folder, recursive = TRUE) -``` +--- +title: "REcoTox - a workflow to process US EPA ECOTOX Knowledgebase ASCII files" +shorttitle: "REcoTox (version `r desc::desc_get_version()`)" +author: Tobias Schulze + | Helmholtz Centre for Environmental Research - UFZ, Leipzig, Germany + | tsufz1@gmail.com +date: 2023-11-04 +output: + BiocStyle::html_document: + toc: true + toc_float: TRUE +vignette: > + %\VignetteIndexEntry{REcoTox - a workflow to process US EPA ECOTOX Knowledgebase ASCII files} + %\VignetteKeywords{E} + %\VignettePackage{REcoTox} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteDepends{BiocStyle, desc} +bibliography: references.bib +csl: biomed-central.csl +editor_options: + markdown: + wrap: 72 +--- + +```{r biocstyle, echo = FALSE, messages = FALSE, results = "hide"} +BiocStyle::markdown() +``` + +```{r init, message = FALSE, echo = FALSE, results = "hide" } +## Silently loading all packages +library(BiocStyle) +library(desc) +library(kableExtra) +library(tidyverse) +``` + +\newpage + +# Background + +The search and extraction of experimental ecotoxicological information +is often a tedious work. A good and comprehensive data source is the [US +EPA ECOTOX +Knowledgebase](https://cfpub.epa.gov/ecotox/ "US EPA ECOTOX Knowledgebase"). +It contains more than 1 million data points for almost 13,000 chemicals +and 14,000 single species. However, for a high-throughput hazard +assessment, it is not possible to extract all relevant data of the +online database. The purpose of REcoTox is to extract the relevant +information and to aggregate the data based on the user criteria out of +the entire database [ASCII +files](https://gaftp.epa.gov/ecotox/ecotox_ascii_03_10_2022.zip "ECOTOX Knowledgebase ASCII files"). 
+
+# Introduction
+
+[REcoTox](https://github.com/tsufz/REcoTox) is a semi-automated,
+interactive workflow to process [US EPA ECOTOX
+Knowledgebase](https://cfpub.epa.gov/ecotox/ "US EPA ECOTOX Knowledgebase")
+entire database [ASCII
+files](https://gaftp.epa.gov/ecotox/ecotox_ascii_03_10_2022.zip "ECOTOX Knowledgebase ASCII files")
+to extract and process ecotoxicological data relevant (but not
+restricted) to the ecotoxicity groups algae, crustaceans, and fish in
+the aquatic domain. The focus is aquatic ecotoxicity and the unit of the
+retrieved data is `mg/L`.
+
+# Input files and folders
+
+`REcoTox` requires an unzipped `US EPA Knowledgebase` database in
+`ASCII` format. The database is preferably extracted into its own
+database folder, which is defined during the processing. The database
+consists of relationally referenced text files. The separator of the data
+is the pipe `|` symbol.
+
+In the first session of `REcoTox`, a file `chemical_properties.csv` is
+created in the database folder. This file contains chemical identifiers
+and chemical properties required for the processing of the chemical data
+in the knowledgebase and to tag the results.
+
+The chemical property file is dynamically updated and also requires some
+manual curation. It will grow as soon as new chemicals are added to the
+knowledgebase.
+
+The `project_folder` contains the `R` script for processing as well as
+the intermediate and final processing files. The naming of the folder is
+arbitrary, but do not use spaces; use underscores (`_`) or hyphens (`-`)
+for separating parts.
+
+To run the queries, a predefined processing script is available on
+`GitHub`
+([`Query_EcoTox_DB.R`](https://github.com/tsufz/REcoTox/blob/main/inst/extdata/Query_Ecotox_DB.R))
+or in the local `REcoTox` package folder.
+
+\newpage
+
+# Using REcoTox
+
+The following tutorial explains the different steps of `REcoTox` in a
+comprehensive demonstration. 
`REcoTox` includes different interactive
+steps, which require the evaluation of comma-separated text files
+(`*.csv`) in an external spreadsheet application (preferably LibreOffice
+[@LibreOffice]).
+
+## Load the REcoTox package
+
+```{r load REcoTox package, eval = FALSE, echo = TRUE, message = FALSE, warning = FALSE}
+# Load the REcoTox package
+library(REcoTox)
+```
+
+## Documentation for REcoTox
+
+A detailed description of all functions of the `REcoTox` package is
+available in the `R Documentation`.
+
+```{r R Documentation, echo = TRUE, eval = FALSE}
+# Documentation of REcoTox
+help(package = "REcoTox")
+```
+
+## Preparation of the working environment (for beginners)
+
+The processing in `REcoTox` is interactively controlled by a processing
+script `Query_EcoTox_DB.R`.
+
+If you run `REcoTox` for the first time, a tutorial project is available
+to demonstrate all important steps of `REcoTox` processing. The
+following script prepares an example folder in your home directory
+and copies all necessary files into the folder.
+
+```{r initialize folders, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE}
+# Path of the project folder
+project_folder <- "REcoTox_demo"
+
+database_folder <- system.file("extdata/database_folder", package="REcoTox")
+# The project folder is created in the home directory
+project_path <- normalizePath(ifelse(.Platform$OS.type == "unix",
+                                     paste0("~/", project_folder),
+                                     paste0(
+                                         Sys.getenv("HOMEPATH"),
+                                         "\\",
+                                         project_folder
+                                     )
+))
+
+# An existing folder is deleted
+if (dir.exists(project_folder)) {
+    unlink(project_folder, recursive = TRUE)
+}
+```
+
+This command initializes the project folder and the database folder. It
+also copies the processing script to the project folder. 
+ +```{r create project, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE} +project <- REcoTox::create_project(database_path = database_folder, + project_path, + initalise_database_project = TRUE, # create the basic project from current ASCII files in DB folder + initalise_project = TRUE, # initializes the project folder + load_default = FALSE) # loads the default project in the project folder in the memoryfault_example = TRUE + +file.copy( + from = system.file( + "extdata", + "Query_EcoTox_DB.R", + package = "REcoTox" + ), + to = normalizePath( + path = file.path( + project_folder, + "Query_EcoTox_DB.R" + ), + winslash = "\\", + mustWork = FALSE + ), + overwrite = TRUE + ) + +``` + +The `project_folder` contains the following files: + +```{r list project folder} +# List files and directories in project_folder +list.files(project_folder, recursive = TRUE, include.dirs = TRUE) +``` + +The `database_folder` contains the following files and folders: +`chemical_properties.csv` is the file containing the curated chemical +properties, `results.txt` contains the testing results collected in the +knowledgebase, and `test.txt` contains the the metadate of the tests. + +The folder `validation` contains the files `chemicals.txt` with chemical +information, the file `references.txt` contains the references and +`species.txt` the species. + +```{r list database folder} +# List files and directories in project_folder +list.files(database_folder, recursive = TRUE, include.dirs = TRUE) +``` + +It contains only the `Query_EcoTox_DB.R` file. 
+ +## Review of the input data + +To review the input data, let us look in the data: + +```{r view chemical_properties, echo = TRUE, eval = TRUE, message = TRUE} +# Review of the chemical properties +chemical_properties <- readr::read_csv(file = normalizePath(path = file.path( + database_folder, + "chemical_properties.csv" +), ), show_col_types = FALSE) + +kable( + chemical_properties %>% + head(5), + format = "html", + digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") +``` + +```{r view results, echo = TRUE, eval = TRUE, message = TRUE} +# Review of the result table +results <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "results.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + results %>% + head(5), + format = "html", digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") +``` + +```{r view chemicals, echo = TRUE, eval = TRUE, message = TRUE} +# Review of the substance_table +substances <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "validation", + "chemicals.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + substances %>% + head(5), + format = "html", digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") +``` + +```{r view references, echo = TRUE, eval = TRUE, message = TRUE} +# Review of the substance_table +references <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "validation", + "references.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + references %>% + head(5), + format = "html", digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") +``` + +```{r view species, echo = TRUE, eval = TRUE, message = TRUE} +# Review of the 
substance_table +species <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "validation", + "species.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + species %>% + head(5), + format = "html", digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") +``` + +## Preparation of the database environment and initialisation of the project folder + +In the first step, the function `create_project` creates the database project and +initializes the database and project folders: + +1. Load the ASCII files. + +2. Create a file `chemical_properties.csv` based o the `chemicals.txt` table. +If this table exists, it is loaded. + +3. Store the initial database project in `project.Rdata` in the database folder. + +4. Store the initial projcet in `initial_project.Rdata` in the project folder. + +### Parameters + +- `initialise_database_project` (TRUE/FALSE): Creates the basic database project +from the current ASCII files in the database folder and (if not existing) stores +the `chemical_properties.csv` in the database folder. + +- `initialise_project` (TRUE/FALSE): Stores the `REcoTox` environment in an initial +`Rdata` object named `initial_project.Rdata` in the project folder. + +- `load_default` (TRUE/FALSE): Loads an existing basic database project from the +database folder and stores it in the project. + +`chemical_properties.csv`: This tables contains the internal chemical ID `cas_number` (i.e. +the cas number in integer format) and related user-curated metadata (e.g., +chemical identifiers such as InChIKey, or PubChem CIDs) and chemical property +data (i.e. log S values). It will be re-used and extended in future +analyses to minimize curation efforts. If this file exists, it will be loaded +to the project environment. Because the `chemicals.txt` table only contains `CAS` numbers +in integer format, a regular CAS number is added (e.g., 1912-24-9 for 1912249). 
+
+```{r initialize databases, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE}
+project <- REcoTox::create_project(database_path = database_folder,
+                                   project_path,
+                                   initalise_database_project = TRUE,
+                                   initalise_project = TRUE,
+                                   load_default = FALSE)
+```
+
+## Preparation of the initial project
+In the second step, utilizing the function `prepare_data`, the tables `test`,
+`results`, `species`, `chemicals`, and `references` are joined by the IDs `test_id`,
+`cas_number`, `species_number`, and `reference_number`. The initial environment
+is stored in the file `initial_project.Rdata` in the project folder.
+This file will be the same for all analyses related to a database revision.
+Thus, it could be copied from another project to avoid rerunning initial steps.
+
+### Parameters
+
+- `project`: Name of the project environment.
+
+- `load_initial_project` (TRUE/FALSE): Loads the `initial_project.Rdata` of the
+project folder.
+
+- `new_project_path`: The `initial_project.Rdata` contains the project folder
+path where it was initially created. For example, in case of moving the project
+folder or if the `initial_project.Rdata` was copied from another folder, it is
+required to set a new project path.
+
+- `save_project` (TRUE/FALSE): Save the `initial_project.Rdata`. For example,
+in case the project folder was renewed.
+
+```{r initialize project, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE}
+project <- REcoTox::prepare_data(project = project,
+                                 load_initial_project = FALSE,
+                                 new_project_path = NA,
+                                 save_project = TRUE
+)
+```
+
+## Processing the data
+In the third step, the function `process_data` reads the following settings to
+query the database accordingly.
+
+A list of relevant endpoints (e.g., EC50) and all relevant species are generated
+and exported to the project folder for review.
+The two files are `ecotoxgroup_endpoint_selection.csv` and
+`ecotoxgroup_species_selection.csv`. 
The review could be performed in any +spreadsheet program. The data must be stored in the `comma` delimited format! + +The former table contains a field `include_endpoint`, this controls the inclusion +of each endpoint by setting the value to 0 or 1 (0 = not included, 1 = included). +Other values are not accepted, and the import of the file in the next processing +step will be declined. The value 0 is the default, and thus, the endpoints to +be included should be marked with 1. + +The latter table contains a field include_species, this controls the inclusion of each species by setting the value to 0 or 1 (0 = not included, 1 = included). Depending on the settings of species_selection, the preset is different: + +- `include_species` is set to 1 + +- `include_species` is set to 0 + +- `include_species` is set to 1 for standard test species and set to 0 for other species + +Review and edit the tables in a preferred spreadsheet program. If changed, +save the changes in the same file. The separator must be `comma`. + +In this step, the database is queried to select the datasets related to the +goals of the analysis. + +The queries can be controlled by the following parameters: + +- `dosing_group`: Specifies the compartment to which the dosing is referenced (so far only "water_concentration", i.e. result is mg/L) + +- `duration_d`: Duration of the exposure in days (e.g. `d`, `dph`, `dpf`) + +- `duration_h`: Duration of the exposure in hours (e.g. `h`, `ht`, `hph`, `hpf`, `hbf`, `hv`) + +- `duration_m`: Duration of the exposure in minutes (e.g. `mi`) + +- `ecotoxgroup`: Species group (e.g. `Algae`, `Crustacean`, `Fish`) + +- `effects`: Effect endpoints (e.g. `MOR`, `GRO`, `DEV`) + +- `habitat`: Habitat of the ecotoxgroup (i.e. `Non-Soil`, `Water`, `Soil`) + +- `kingdoms`: Specification of the `algae` kingdoms (e.g. 
`Chromista`, `Plantae`, `Monera`)
+
+- `measurements`: Specification of specific measurements
+
+- `min_h`: Minimum duration of the experiment in hours
+
+- `max_h`: Maximum duration of the experiment in hours
+
+- `min_d`: Minimum duration of the experiment in days
+
+- `max_d`: Maximum duration of the experiment in days
+
+- `min_m`: Minimum duration of the experiment in minutes
+
+- `max_m`: Maximum duration of the experiment in minutes
+
+- `species_selection`: Selection of species (i.e. `all`, `manual`, `standard_test_species`)
+
+Where `all` selects all species of an `ecotoxgroup`, `manual` expects manual selection in
+the files mentioned above and `standard_test_species` selects only species marked as
+standardized species.
+
+### Filtering the data
+In the processing step 1, the data in the database is filtered based on the settings
+to extract relevant data of the database.
+
+```{r run_step_1, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE}
+
+# set the parameters
+dosing_group = "water_concentration" # i.e.
mg/L (only available group in this version)
+duration_d = c("d", "dph", "dpf")
+duration_h = c("h", "ht", "hph", "hpf", "hbf", "hv")
+duration_m = "mi"
+ecotoxgroup = "Algae" # c("Algae", "Crustacean", "Fish")
+effects = c("MOR", "GRO", "POP", "REP", "MPH", "DEV") # Algae/Fish
+#effects = c("MOR", "GRO", "POP", "REP", "MPH", "DEV", "ITX") # Crustacean
+habitat = "Water" #c("Non-Soil","Water","Soil")
+kingdoms = NA # vector of specific algae kingdoms: c("Chromista","Plantae","Monera")
+measurements = NA # vector of specific measurements
+min_h = 0
+min_d = 0
+max_h = 120
+max_d = 5
+min_m = 0
+max_m = 7200
+species_selection = "all" # c("all", "manual", "standard_test_species")
+
+# run the processing step
+project <- REcoTox::process_data(project,
+                                 dosing_group = dosing_group,
+                                 duration_d = duration_d,
+                                 duration_h = duration_h,
+                                 duration_m = duration_m,
+                                 ecotoxgroup = ecotoxgroup,
+                                 effects = effects,
+                                 habitat = habitat,
+                                 kingdoms = kingdoms,
+                                 measurements = measurements,
+                                 max_d = max_d,
+                                 min_d = min_d,
+                                 max_h = max_h,
+                                 min_h = min_h,
+                                 max_m = max_m,
+                                 min_m = min_m,
+                                 remove_formulation = FALSE,
+                                 save_project_steps = FALSE,
+                                 species_selection = species_selection
+)
+
+```
+
+This step stores two files in the `project_folder`, `ecotoxgroup_species_selection.csv`
+and `ecotoxgroup_endpoint_selection.csv`. The first block of the file is related to the
+ecotoxgroup specified. The species selection file contains all species extracted
+for review and the endpoint selection file the respective endpoints (e.g. EC50). To include
+species or endpoints, mark the data with `1`, otherwise to exclude, mark with `0`.
+
+### Filtering species and endpoints
+
+After review and saving the files, run the following command. This command
+reads the files and the data is filtered accordingly.
+
+The units in the database are quite divergent and thus a unit conversion is performed
+to transform all units and values to `mg/L`.
In case of mol related units,
+the transformation is automated as far as the chemical and the molecular weight is
+already in the database. If not, the file `ecotoxgroup_mol_weight.csv` is exported
+to the `project_folder`.
+
+```{r run_step_2, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE}
+project <- REcoTox::process_data(project, save_project_steps = FALSE)
+
+```
+
+### Unit conversion
+
+Review and edit the file `ecotoxgroup_mol_weight.csv` to add the molecular weight to
+the list. The ecotoxicity data is interactively enriched with chemical information
+(e.g. the average mass).
+
+In the best case, with data linked to [US EPA CompTox Chemicals Dashboard](https://comptox.epa.gov/dashboard/ "US EPA CompTox Chemicals Dashboard")
+for example by using the output of the [batch search](https://comptox.epa.gov/dashboard/batch-search "US EPA CompTox Chemicals Dashboard Batch Search")
+according to Figure 1 and Figure 2.
+
+![Figure 1: US EPA CompTox Chemicals Dashboard Batch Search - Enter Identifiers to Search](./figures/Figure_1.png "Figure 1: US EPA CompTox Chemicals Dashboard Batch Search - Enter Identifiers to Search")
+
+![Figure 2: US EPA CompTox Chemicals Dashboard Batch Search - Recommended selection of identifiers and properties](./figures/Figure_2.png "Figure 2: US EPA CompTox Chemicals Dashboard Batch Search - Recommended selection of identifiers and properties")
+
+After update of the mol weight table, run the following command to finalise the
+unit conversion step.
+
+```{r run_step_3, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE}
+project <- REcoTox::process_data(project, save_project_steps = FALSE)
+
+```
+### Chemical properties data and final processing
+
+The former processing step creates a file named `ecotoxgroup_chemical_list.csv`.
+Edit this list to include newly added compounds (imputation of phys.-chem.
+properties and metadata).
+
+To score the quality of the data, the solubility domain of the result is
+calculated.
The calculation requires the experimental or predicted solubility +of the chemical. + +```{run step 4, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE} +project <- REcoTox::process_data(project, save_project_steps = FALSE, update_chemicals = FALSE) + +``` + +The file `ecotoxgroup_final_results.csv` is stored in the `project_folder`. +It contains the results of the processing in the long pivot format. + + +### Preparation of the wide pivot table with the aggregated ecotoxicity information + +For final processing and to aggregate the data in the wide pivot format, +run the following final step. + +```{run step 5, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE} +project <- REcoTox::aggregate_results(project = project, quantile = 0.05) + +``` + +``{r view references, echo = TRUE, eval = TRUE, message = TRUE} +# Review of the substance_table +references <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "validation", + "references.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + references %>% + head(5), + format = "html", digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") +``` + +An example of the final data: + +```{r view pivot tables, echo = FALSE, eval = TRUE, message = TRUE} +# Review of the privot table +pivot <- + project$object$results_pivot + +kable( + pivot %>% + head(5), + format = "html", digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px") +``` + +\newpage + +# SessionInfo + +```{r sessioninfo, echo = TRUE, eval = TRUE, message = FALSE} +sessionInfo() +``` + +\newpage + +# References + +```{r clean_up, echo = FALSE, results = "asis", eval = FALSE} +unlink(project_folder, recursive = TRUE) +``` diff --git a/vignettes/REcoTox.html b/vignettes/REcoTox.html new file mode 100644 index 0000000..aa95f07 --- /dev/null +++ b/vignettes/REcoTox.html @@ -0,0 
+1,5901 @@ + + + + + + + + + + + + + + + +REcoTox - a workflow to process US EPA ECOTOX Knowledgebase ASCII files + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + +
+
+
+
+
+ +
+ + + + + + + +
+
+

1 Background

+

The search and extraction of experimental ecotoxicological information +is often a tedious work. A good and comprehensive data source is the US +EPA ECOTOX +Knowledgebase. +It contains more than 1 million data points for almost 13,000 chemicals +and 14,000 single species. However, for a high-throughput hazard +assessment, it is not possible to extract all relevant data of the +online database. The purpose of REcoTox is to extract the relevant +information and to aggregate the data based on the user criteria out of +the entire database ASCII +files.

+
+
+

2 Introduction

+

REcoTox is a semi-automated, +interactive workflow to process US EPA ECOTOX +Knowledgebase +entire database ASCII +files +to extract and process ecotoxicological data relevant (but not +restricted) to the ecotoxicity groups algae, crustaceans, and fish in +the aquatic domain. The focus is aquatic ecotoxicity and the unit of the +retrieved data is mg/L.

+
+
+

3 Input files and folders

+

REcoTox requires an unzipped US EPA Knowledgebase database in
+ASCII format (see citation). The database is preferably expanded in its own
+database folder to be defined during the processing. The database
+consists of relatively referenced text files. The separator of the data
+is the pipe | symbol.

+

In the first session of REcoTox, a file chemical_properties.csv is
+created in the database folder. This file contains chemical identifiers
+and chemical properties required for the processing of the chemical data
+in the knowledgebase and to tag the results.

+

The chemical property file is dynamically updated and requires also some +manual curation. It will grow as soon new chemicals are added to the +knowledgebase.

+

The project_folder contains the R script for processing as well as +the intermediate and final processing files. The naming of the folder is +arbitrary, but do not use spaces, but underscores (_) or hyphens (-) +for separating parts.

+

To run the queries, a predefined processing script is available on +GitHub +(Query_EcoTox_DB.R) +or in the local REcoTox package folder.

+
+
+
+

4 Using REcoTox

+

The following tutorial explains the different steps of REcoTox in a +comprehensive demonstration. REcoTox includes different interactive +steps, which require the evaluation of comma separated text files +(*.csv) in an external spreadsheet application (preferable LibreOffice +[1]).

+
+

4.1 Load the REcoTox package

+
# Load the REcoTox package
+library(REcoTox)
+
+
+

4.2 Documentation for MZquant

+

A detailed description of all functions of REcoTox functions is +available in the R Documentation.

+
# Documentation of REcoTox
+help(package = "REcoTox")
+
+
+

4.3 Preparation of the working environment (for beginners)

+

The processing in REcoTox is interactively controlled by a processing
+script Query_EcoTox_DB.R.

+

If you run REcoTox for the first time, a tutorial project is available +to demonstrate all important steps of REcoTox processing. The +following script is preparing an example folder in your home directory +and copies all necessary files in the folder.

+
# Path of the project folder
+project_folder <- "REcoTox_demo"
+
+database_folder <- system.file("extdata/database_folder", package="REcoTox")
+# The project folder is created in the home directory
+project_path <- normalizePath(ifelse(.Platform$OS.type == "unix",
+    paste0("~/", project_folder),
+    paste0(
+        Sys.getenv("HOMEPATH"),
+        "\\",
+        project_folder
+    )
+))
+
+# An existing folder is deleted
+if (dir.exists(project_folder)) {
+    unlink(project_folder, recursive = TRUE)
+}
+

This command initializes the project folder and the database folder. It +copies also the processing script to the project folder.

+
project <- REcoTox::create_project(database_path = database_folder,
+                          project_path,
+                          initalise_database_project = TRUE, # create the basic project from current ASCII files in DB folder
+                          initalise_project = TRUE, # initializes the project folder
+                          load_default = FALSE) # loads the default project in the project folder in the memoryfault_example = TRUE
+
+file.copy(
+                    from = system.file(
+                        "extdata",
+                        "Query_EcoTox_DB.R",
+                        package = "REcoTox"
+                    ),
+                    to = normalizePath(
+                        path = file.path(
+                            project_folder,
+                            "Query_EcoTox_DB.R"
+                        ),
+                        winslash = "\\",
+                        mustWork = FALSE
+                    ),
+                    overwrite = TRUE
+                )
+
## [1] FALSE
+

The project_folder contains the following files:

+
# List files and directories in project_folder
+list.files(project_folder, recursive = TRUE, include.dirs = TRUE)
+
## character(0)
+

The database_folder contains the following files and folders:
+chemical_properties.csv is the file containing the curated chemical
+properties, results.txt contains the testing results collected in the
+knowledgebase, and tests.txt contains the metadata of the tests.

+

The folder validation contains the files chemicals.txt with chemical +information, the file references.txt contains the references and +species.txt the species.

+
# List files and directories in project_folder
+list.files(database_folder, recursive = TRUE, include.dirs = TRUE)
+
## [1] "chemical_properties.csv"   "project.RData"            
+## [3] "results.txt"               "tests.txt"                
+## [5] "validation"                "validation/chemicals.txt" 
+## [7] "validation/references.txt" "validation/species.txt"
+

It contains only the Query_EcoTox_DB.R file.

+
+
+

4.4 Review of the input data

+

To review the input data, let us look in the data:

+
# Review of the chemical properties
+chemical_properties <- readr::read_csv(file = normalizePath(path = file.path(
+    database_folder,
+    "chemical_properties.csv"
+), ), show_col_types = FALSE)
+
+kable(
+    chemical_properties %>%
+        head(5),
+    format = "html",
+    digits = 2
+) %>%
+kable_styling("striped", full_width = TRUE) %>% 
+scroll_box(width = "700px", height = "300px")
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+cas_number + +cas + +chemical_name + +dtxsid_ecotox + +PubChem_CID + +FOUND_BY + +DTXSID_DTX + +PREFERRED_NAME + +CASRN + +INCHIKEY + +IUPAC_NAME + +SMILES + +INCHI_STRING + +QSAR_READY_SMILES + +MOLECULAR_FORMULA + +AVERAGE_MASS + +MONOISOTOPIC_MASS + +QC_LEVEL + +LOG_S + +LOG_S_AD + +S_AD_index + +LOG_S_COMMENT + +EXCLUDE + +REMARKS +
+1912249 + +1912-24-9 + +6-Chloro-N-ethyl-N’-(1-methylethyl)-1,3,5-triazine-2,4-diamine + +DTXSID9020112 + +2256 + +DSSTox_Substance_Id + +DTXSID9020112 + +Atrazine + +1912-24-9 + +MXWJVTOOROXGIU-UHFFFAOYSA-N + +6-Chloro-N2-ethyl-N4-(propan-2-yl)-1,3,5-triazine-2,4-diamine + +CCNC1=NC(NC(C)C)=NC(Cl)=N1 + +InChI=1S/C8H14ClN5/c1-4-10-7-12-6(9)13-8(14-7)11-5(2)3/h5H,4H2,1-3H3,(H2,10,11,12,13,14) + +CCNC1N=C(Cl)N=C(NC(C)C)N=1 + +C8H14ClN5 + +215.69 + +215.09 + +1 + +-3.7 + +1 + +1 + +OPERA_2.9 + +NA + +NA +
+
+
# Review of the result table
+results <-
+    readr::read_delim(
+        file = normalizePath(
+            path = file.path(
+                database_folder,
+                "results.txt"
+            ),
+        ),
+        show_col_types = FALSE,
+        delim = "|"
+        
+    )
+
+kable(
+    results %>%
+        head(5),
+    format = "html", digits = 2
+) %>%
+kable_styling("striped", full_width = TRUE) %>% 
+scroll_box(width = "700px", height = "300px")
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+result_id + +test_id + +sample_size_mean_op + +sample_size_mean + +sample_size_min_op + +sample_size_min + +sample_size_max_op + +sample_size_max + +sample_size_unit + +sample_size_comments + +obs_duration_mean_op + +obs_duration_mean + +obs_duration_min_op + +obs_duration_min + +obs_duration_max_op + +obs_duration_max + +obs_duration_unit + +obs_duration_comments + +endpoint + +endpoint_comments + +trend + +effect + +effect_comments + +measurement + +measurement_comments + +response_site + +response_site_comments + +effect_pct_mean_op + +effect_pct_mean + +effect_pct_min_op + +effect_pct_min + +effect_pct_max_op + +effect_pct_max + +effect_pct_comments + +conc1_type + +ion1 + +conc1_mean_op + +conc1_mean + +conc1_min_op + +conc1_min + +conc1_max_op + +conc1_max + +conc1_unit + +conc1_comments + +conc2_type + +ion2 + +conc2_mean_op + +conc2_mean + +conc2_min_op + +conc2_min + +conc2_max_op + +conc2_max + +conc2_unit + +conc2_comments + +conc3_type + +ion3 + +conc3_mean_op + +conc3_mean + +conc3_min_op + +conc3_min + +conc3_max_op + +conc3_max + +conc3_unit + +conc3_comments + +bcf1_mean_op + +bcf1_mean + +bcf1_min_op + +bcf1_min + +bcf1_max_op + +bcf1_max + +bcf1_unit + +bcf1_comments + +bcf2_mean_op + +bcf2_mean + +bcf2_min_op + +bcf2_min + +bcf2_max_op + +bcf2_max + +bcf2_unit + +bcf2_comments + +bcf3_mean_op + +bcf3_mean + +bcf3_min_op + +bcf3_min + +bcf3_max_op + +bcf3_max + +bcf3_unit + +bcf3_comments + +significance_code + +significance_type + +significance_level_mean_op + +significance_level_mean + +significance_level_min_op + +significance_level_min + +significance_level_max_op + +significance_level_max + +significance_comments + +chem_analysis_method + +chem_analysis_method_comments + +endpoint_assigned + +organism_final_wt_mean_op + +organism_final_wt_mean + +organism_final_wt_min_op + +organism_final_wt_min + +organism_final_wt_max_op + +organism_final_wt_max + +organism_final_wt_unit + +organism_final_wt_comments + +intake_rate_mean_op + 
+intake_rate_mean + +intake_rate_min_op + +intake_rate_min + +intake_rate_max_op + +intake_rate_max + +intake_rate_unit + +intake_rate_comments + +lipid_pct_mean_op + +lipid_pct_mean + +lipid_pct_min_op + +lipid_pct_min + +lipid_pct_max_op + +lipid_pct_max + +lipid_pct_comments + +dry_wet + +dry_wet_pct_mean_op + +dry_wet_pct_mean + +dry_wet_pct_min_op + +dry_wet_pct_min + +dry_wet_pct_max_op + +dry_wet_pct_max + +dry_wet_pct_comments + +steady_state + +additional_comments + +companion_tag + +created_date + +modified_date + +old_terretox_result_number +
+198582 + +1212738 + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +72 + +NA + +NA + +NA + +NA + +h + +NA + +EC50 + +NA + +DEC + +POP + +NA + +PGRT + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +F + +NA + +NA + +3367.9 + +NA + +NA + +NA + +NA + +ug/L + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +U + +NA + +P + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +LIFESTG/STRAIN// + +NA + +01/19/2001 + +NA + +NA +
+2176441 + +2095522 + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +72 + +NA + +NA + +NA + +NA + +h + +NA + +LOEC + +NA + +DEC + +BCM + +NA + +FLRS/ + +NON-PHOTOCHEMICAL QUENCHING + +NA + +NA + +NA + +NA + +> + +60 + +< + +80 + +OF CONTROL, FROM GRAPH + +F + +NA + +NA + +250 + +NA + +NA + +NA + +NA + +nM + +DATA NOT PRESENTED FOR 25, 65, 125, 500, 1000, 2000 AND 10000 NM CONCS + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +SIG + +P + +< + +0.05 + +NA + +NA + +NA + +NA + +NA + +U + +NA + +R + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +EFCT%/OF CONTROL, FROM GRAPH//CONC1/DATA NOT PRESENTED FOR 25, 65, 125, 500, 1000, 2000 AND 10000 NM CONCS// + +NA + +02/07/2014 + +02/07/2014 + +NA +
+713354 + +1262587 + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +24 + +NA + +NA + +NA + +NA + +h + +NA + +EC50 + +NA + +DEC + +POP + +NA + +ABND/ + +AUTHOR REPORTED CULTURE POPULATION CHANGE AS INHIBITION OF ALGAL REPRODUCTION + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +A + +NA + +NA + +0.082 + +NA + +0.07 + +NA + +0.09 + +umol/L + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +U + +NA + +P + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +LIFESTG/KRAUSS, STRAIN 211-15, GROWN PHOTOAUTOTROPHICALLY IN INORGANIC STERILISED MEDIUM, ENRICHED WITH 1.9MMOL PER LITER NAHCO3 AS BUFFER//DNUM/13//TESTID/1//OEF, BASELINE EC50 BASED ON QSAR, NOT CODED, FROM REVIEW// + +NA + +10/24/2007 + +NA + +NA +
+2200676 + +2110007 + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +6 + +NA + +NA + +NA + +NA + +wk + +NA + +NOEC + +NA + +DEC + +POP + +NA + +ABND/ + +STATS TO V CONTROL + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +A + +NA + +NA + +NA + +NA + +4.33 + +NA + +268.00 + +ug/L + +ONLY CONC TESTED + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +ANOSIG + +A + +NA + +0.05 + +NA + +NA + +NA + +NA + +NA + +M + +NA + +R + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +CONC1/ONLY CONC TESTED// + +NA + +06/11/2014 + +06/11/2014 + +NA +
+2380412 + +2190107 + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +96 + +NA + +NA + +NA + +NA + +h + +NA + +NOEL + +NA + +DEC + +CEL + +NA + +NLEI/ + +RELATIVE VOLUME FRACTION. MITOCHONDRIA ALSO REPORTED. SEVERAL STATISTICAL METHODS USED. + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +A + +NA + +NA + +89 + +NA + +NA + +NA + +NA + +ug/L + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +ANOSIG + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +M + +NA + +R + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +NA + +CHAR/USED WITH TREATMENT.//DNUM/4//DOSES/22/42/89 ug/L// + +NA + +04/11/2017 + +04/11/2017 + +NA +
+
+
# Review of the substance_table
+substances <-
+    readr::read_delim(
+        file = normalizePath(
+            path = file.path(
+                database_folder,
+                "validation",
+                "chemicals.txt"
+            ),
+        ),
+        show_col_types = FALSE,
+        delim = "|"
+        
+    )
+
+kable(
+    substances %>%
+        head(5),
+    format = "html", digits = 2
+) %>%
+kable_styling("striped", full_width = TRUE) %>% 
+scroll_box(width = "700px", height = "300px")
+
+
+ + + + + + + + + + + + + + + + +
+cas_number + +chemical_name + +ecotox_group + +dtxsid +
+1912249 + +6-Chloro-N-ethyl-N’-(1-methylethyl)-1,3,5-triazine-2,4-diamine + +Endocrine Disrupting Chemicals (EDCs) + +DTXSID9020112 +
+
+
# Review of the substance_table
+references <-
+    readr::read_delim(
+        file = normalizePath(
+            path = file.path(
+                database_folder,
+                "validation",
+                "references.txt"
+            ),
+        ),
+        show_col_types = FALSE,
+        delim = "|"
+        
+    )
+
+kable(
+    references %>%
+        head(5),
+    format = "html", digits = 2
+) %>%
+kable_styling("striped", full_width = TRUE) %>% 
+scroll_box(width = "700px", height = "300px")
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+reference_number + +reference_db + +reference_type + +author + +title + +source + +publication_year +
+95 + +A + +NA + +Abou-Waly,H., M.M. Abou-Setta, H.N. Nigg, and L.L. Mallory + +Growth Response of Freshwater Algae, Anabaena flos-aquae and Selenastrum capricornutum to Atrazine and Hexazinone Herbicides + +Bull. Environ. Contam. Toxicol.46(2): 223-229 + +1991 +
+344 + +AT + +OPP + +U.S. Environmental Protection Agency + +Pesticide Ecotoxicity Database (Formerly: Environmental Effects Database (EEDB)) + +Environmental Fate and Effects Division, U.S.EPA, Washington, D.C.: + +1992 +
+393 + +A + +NA + +Hersh,C.M., and W.G. Crumpton + +Atrazine Tolerance of Algae Isolated From Two Agricultural Streams + +Environ. Toxicol. Chem.8(4): 327-332 + +1989 +
+624 + +A + +NA + +Valentine,J.P., and S.W. Bingham + +Influence of Algae on Amitrole and Atrazine Residues in Water + +Can. J. Bot.54(18): 2100-2107 + +1976 +
+682 + +A + +NA + +Isensee,A.R. + +Variability of Aquatic Model Ecosystem-Derived Data + +Int. J. Environ. Stud.10:35-41 + +1976 +
+
+
# Review of the substance_table
+species <-
+    readr::read_delim(
+        file = normalizePath(
+            path = file.path(
+                database_folder,
+                "validation",
+                "species.txt"
+            ),
+        ),
+        show_col_types = FALSE,
+        delim = "|"
+        
+    )
+
+kable(
+    species %>%
+        head(5),
+    format = "html", digits = 2
+) %>%
+kable_styling("striped", full_width = TRUE) %>% 
+scroll_box(width = "700px", height = "300px")
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+species_number + +common_name + +latin_name + +kingdom + +phylum_division + +subphylum_div + +superclass + +class + +tax_order + +family + +genus + +species + +subspecies + +variety + +ecotox_group + +ncbi_taxid +
+31 + +Green Algae + +Chlorella ovalis + +Plantae + +Chlorophyta + +NA + +NA + +Chlorophyceae + +Chlorococcales + +Oocystaceae + +Chlorella + +ovalis + +NA + +NA + +Algae + +3071 +
+47 + +Green Algae + +Scenedesmus sp. + +Plantae + +Chlorophyta + +NA + +NA + +Chlorophyceae + +Chlorococcales + +Scenedesmaceae + +Scenedesmus + +sp. + +NA + +NA + +Algae + +3087 +
+147 + +Green Algae + +Spirogyra crassa + +Plantae + +Chlorophyta + +NA + +NA + +Chlorophyceae + +Zygnematales + +Zygnemataceae + +Spirogyra + +crassa + +NA + +NA + +Algae + +NA +
+298 + +Green Algae + +Dunaliella tertiolecta + +Plantae + +Chlorophyta + +NA + +NA + +Chlorophyceae + +Volvocales + +Dunaliellaceae + +Dunaliella + +tertiolecta + +NA + +NA + +Standard Test SpeciesAlgae + +3047 +
+300 + +Green Algae + +Chlorococcum sp. + +Plantae + +Chlorophyta + +NA + +NA + +Chlorophyceae + +Chlorococcales + +Chlorococcaceae + +Chlorococcum + +sp. + +NA + +NA + +Algae + +44649 +
+
+
+
+

4.5 Preparation of the database environment and initialisation of the project folder

+

In the first step, the function create_project creates the database project and +initializes the database and project folders:

+
    +
  1. Load the ASCII files.

  2. +
  3. Create a file chemical_properties.csv based on the chemicals.txt table.
+If this table exists, it is loaded.

  4. +
  5. Store the initial database project in project.Rdata in the database folder.

  6. +
  7. Store the initial project in initial_project.Rdata in the project folder.

  8. +
+
+

4.5.1 Parameters

+
    +
  • initialise_database_project (TRUE/FALSE): Creates the basic database project +from the current ASCII files in the database folder and (if not existing) stores +the chemical_properties.csv in the database folder.

  • +
  • initialise_project (TRUE/FALSE): Stores the REcoTox environment in an initial +Rdata object named initial_project.Rdata in the project folder.

  • +
  • load_default (TRUE/FALSE): Loads an existing basic database project from the +database folder and stores it in the project.

  • +
+

chemical_properties.csv: This table contains the internal chemical ID cas_number (i.e. 
+the cas number in integer format) and related user-curated metadata (e.g.,
+chemical identifiers such as InChIKey, or PubChem CIDs) and chemical property
+data (i.e. log S values). It will be re-used and extended in future
+analyses to minimize curation efforts. If this file exists, it will be loaded
+to the project environment. Because the chemicals.txt table only contains CAS numbers
+in integer format, a regular CAS number is added (e.g., 1912-24-9 for 1912249).

+
project <- REcoTox::create_project(database_path = database_folder,
+                          project_path,
+                          initalise_database_project = TRUE,
+                          initalise_project = TRUE,
+                          load_default = FALSE)
+
+
+
+

4.6 Preparation of the initial project

+

In the second step, utilizing the function prepare_data, the tables test, +results, species, chemicals, and references are joined by the IDs test_id, +cas_number, species_number, and reference_number. The initial environment +is stored in the file initial_project.Rdata in the project folder. +This file will be the same for all analyses related to a database revision. +Thus, it could be copied from another project to avoid rerunning initial steps.

+
+

4.6.1 Parameters

+
    +
  • project: Name of the project environment.

  • +
  • load_initial_project (TRUE/FALSE): Loads the initial_project.Rdata of the +project folder.

  • +
  • new_project_path: The initial_project.Rdata contains the project folder +path where it was initially created. For example, in case of moving the project +folder or if the initial_project.Rdata was copied from another folder, it is +required to set a new project path.

  • +
  • save_project (TRUE/FALSE): Save the initial_project.Rdata. For example, +in case, the project folder was renewed.

  • +
+
project <- REcoTox::prepare_data(project = project,
+                        load_initial_project = FALSE,
+                        new_project_path = NA,
+                        save_project = TRUE
+)
+
+
+
+

4.7 Processing the data

+

In the third step, the function process_data reads the following settings to +query the database accordingly.

+

A list of relevant endpoints (e.g., EC50) and all relevant species are generated +and exported to the project folder for review. +The two files are ecotoxgroup_endpoint_selection.csv and +ecotoxgroup_species_selection.csv. The review could be performed in any +spreadsheet program. The data must be stored in the comma delimited format!

+

The former table contains a field include_endpoint, which controls the inclusion +of each endpoint by setting the value to 0 or 1 (0 = not included, 1 = included). +Other values are not accepted, and the import of the file in the next processing +step will be declined. The value 0 is the default, and thus, the endpoints to +be included should be marked with 1.

+

The latter table contains a field include_species, which controls the inclusion of each species by setting the value to 0 or 1 (0 = not included, 1 = included). Depending on the settings of species_selection, the preset is different:

+
    +
  • include_species is set to 1

  • +
  • include_species is set to 0

  • +
  • include_species is set to 1 for standard test species and set to 0 for other species

  • +
+

Review and edit the tables in a preferred spreadsheet program. If changed, +save the changes in the same file. The separator must be comma.

+

In this step, the database is queried to select the datasets related to the +goals of the analysis.

+

The queries can be controlled by the following parameters:

+
    +
  • dosing_group: Specifies the compartment to which the dosing is referenced (so far only “water_concentration”, i.e. result is mg/L)

  • +
  • duration_d: Duration of the exposure in days (e.g. d, dph, dpf)

  • +
  • duration_h: Duration of the exposure in hours (e.g. h, ht, hph, hpf, hbf, hv)

  • +
  • duration_m: Duration of the exposure in minutes (e.g. mi)

  • +
  • ecotoxgroup: Species group (e.g. Algae, Crustacean, Fish)

  • +
  • effects: Effect endpoints (e.g. MOR, GRO, DEV)

  • +
  • habitat: Habitat of the ecotoxgroup (i.e. Non-Soil, Water, Soil)

  • +
  • kingdoms: Specification of the algae kingdoms (e.g. Chromista, Plantae, Monera)

  • +
  • measurements: Specification of specific measurements

  • +
  • min_h: Minimum duration of the experiment in hours

  • +
  • max_h: Maximum duration of the experiment in hours

  • +
  • min_d: Minimum duration of the experiment in days

  • +
  • max_d: Maximum duration of the experiment in days

  • +
  • min_m: Minimum duration of the experiment in minutes

  • +
  • max_m: Maximum duration of the experiment in minutes

  • +
  • species_selection: Selection of species (i.e. all, manual, standard_test_species)

  • +
+

Where all selects all species of an ecotoxgroup, manual expects manual selection in +the files mentioned above and standard_test_species selects only species marked as +standardized species.

+
+

4.7.1 Filtering the data

+

In the processing step 1, the data in the database is filtered based on the settings +to extract relevant data of the database.

+

+# set the parameters
+dosing_group = "water_concentration" # i.e. mg/L (only available group in this version)
+duration_d = c("d", "dph", "dpf")
+duration_h = c("h", "ht", "hph", "hpf", "hbf", "hv")
+duration_m = "mi"
+ecotoxgroup = "Algae" # c("Algae", "Crustacean", "Fish")
+effects = c("MOR", "GRO", "POP", "REP", "MPH", "DEV") # Algae/Fish
+#effects = c("MOR", "GRO", "POP", "REP", "MPH", "DEV", "ITX") # Crustacean
+habitat = "Water" #c("Non-Soil","Water","Soil")
+kingdoms = NA # vector of specific algae kingdoms: c("Chromista","Plantae","Monera")
+measurements = NA # vector of specific measurements
+min_h = 0
+min_d = 0
+max_h = 120
+max_d = 5
+min_m = 0
+max_m = 7200
+species_selection = "all" # c("all", "manual", "standard_test_species")
+
+# run the processing step
+project <- REcoTox::process_data(project,
+                        dosing_group = dosing_group,
+                        duration_d = duration_d,
+                        duration_h = duration_h,
+                        duration_m = duration_m,
+                        ecotoxgroup = ecotoxgroup,
+                        effects = effects,
+                        habitat = habitat,
+                        kingdoms = kingdoms,
+                        measurements = measurements,
+                        max_d = max_d,
+                        min_d = min_d,
+                        max_h = max_h,
+                        min_h = min_h,
+                        max_m = max_m,
+                        min_m = min_m,
+                        remove_formulation = FALSE,
+                        save_project_steps = FALSE,
+                        species_selection = species_selection
+)
+
+

This step stores two files in the project_folder, ecotoxgroup_species_selection.csv +and ecotoxgroup_endpoint_selection.csv. The first block of the file is related to the +ecotoxgroup specified. The species selection file contains all species extracted +for review, and the endpoint selection file contains the respective endpoints (e.g. EC50). To include +species or endpoints, mark the data with 1, otherwise to exclude, mark with 0.

+
+
+

4.7.2 Filtering species and endpoints

+

After review and saving the files, run the following command. This command +reads the files and the data is filtered accordingly.

+

The units in the database are quite divergent and thus a unit conversion is performed +to transform all units and values to mg/L. In case of mol related units, +the transformation is automated provided the chemical and its molecular weight are +already in the database. If not, the file ecotoxgroup_mol_weight.csv is exported +to the project_folder.

+
project <- REcoTox::process_data(project, save_project_steps = FALSE)
+
+
+
+

4.7.3 Unit conversion

+

Review and edit the file ecotoxgroup_mol_weight.csv to add the molecular weight to +the list. The ecotoxicity data is interactively enriched with chemical information +(e.g. the average mass).

+

Ideally, the data are linked to the US EPA CompTox Chemicals Dashboard, +for example by using the output of the batch search +according to Figure 1 and Figure 2.

+
+Figure1: US EPA CompTox Chemicals Dashboard Batch Search - Enter Identifiers to Search +
Figure1: US EPA CompTox Chemicals Dashboard Batch Search - Enter Identifiers to Search
+
+
+Figure 2: US EPA CompTox Chemicals Dashboard Batch Search - Recommended selection of identifiers and properties +
Figure 2: US EPA CompTox Chemicals Dashboard Batch Search - Recommended selection of identifiers and properties
+
+

After update of the mol weight table, run the following command to finalise the +unit conversion step.

+
project <- REcoTox::process_data(project, save_project_steps = FALSE)
+
+
+
+

4.7.4 Chemical properties data and final processing

+

The former processing step creates a file named ecotoxgroup_chemical_list.csv. +Edit this list to include newly added compounds (imputation of phys.-chem. +properties and metadata).

+

To score the quality of the data, the solubility domain of the result is +calculated. The calculation requires the experimental or predicted solubility +of the chemical.

+
project <- REcoTox::process_data(project, save_project_steps = FALSE, update_chemicals = FALSE)
+
+

The file ecotoxgroup_final_results.csv is stored in the project_folder. +It contains the results of the processing in the long pivot format.

+
+
+

4.7.5 Preparation of the wide pivot table with the aggregated ecotoxicity information

+

For final processing and to aggregate the data in the wide pivot format, +run the following final step.

+
project <- REcoTox::aggregate_results(project = project, quantile = 0.05)
+
+

```{r view references, echo = TRUE, eval = TRUE, message = TRUE} +# Review of the substance_table +references <- +readr::read_delim( +file = normalizePath( +path = file.path( +database_folder, +"validation", +"references.txt" +), +), +show_col_types = FALSE, +delim = "|"

+
)
+

kable( +references %>% +head(5), +format = "html", digits = 2 +) %>% +kable_styling("striped", full_width = TRUE) %>% +scroll_box(width = "700px", height = "300px")

+

+An example of the final data:
+
+<div style="border: 1px solid #ddd; padding: 0px; overflow-y: scroll; height:300px; overflow-x: scroll; width:700px; "><table class="table table-striped" style="margin-left: auto; margin-right: auto;">
+<tbody>
+  <tr>
+
+  </tr>
+</tbody>
+</table></div>
+
+\newpage
+
+# SessionInfo
+
+
+```r
+sessionInfo()
+
## R version 4.3.1 (2023-06-16)
+## Platform: x86_64-pc-linux-gnu (64-bit)
+## Running under: Ubuntu 23.10
+## 
+## Matrix products: default
+## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
+## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.23.so;  LAPACK version 3.11.0
+## 
+## locale:
+##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
+##  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=C              
+##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
+##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
+##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
+## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
+## 
+## time zone: Europe/Berlin
+## tzcode source: system (glibc)
+## 
+## attached base packages:
+## [1] stats     graphics  grDevices utils     datasets  methods   base     
+## 
+## other attached packages:
+##  [1] lubridate_1.9.3  forcats_1.0.0    stringr_1.5.0    dplyr_1.1.3     
+##  [5] purrr_1.0.2      readr_2.1.4      tidyr_1.3.0      tibble_3.2.1    
+##  [9] ggplot2_3.4.4    tidyverse_2.0.0  kableExtra_1.3.4 desc_1.4.2      
+## [13] BiocStyle_2.28.1 REcoTox_0.4.1   
+## 
+## loaded via a namespace (and not attached):
+##  [1] tidyselect_1.2.0    viridisLite_0.4.2   fastmap_1.1.1      
+##  [4] promises_1.2.1      digest_0.6.33       timechange_0.2.0   
+##  [7] mime_0.12           lifecycle_1.0.3     ellipsis_0.3.2     
+## [10] processx_3.8.2      magrittr_2.0.3      compiler_4.3.1     
+## [13] rlang_1.1.1         sass_0.4.7          progress_1.2.2     
+## [16] tools_4.3.1         utf8_1.2.4          yaml_2.3.7         
+## [19] data.table_1.14.8   knitr_1.44          prettyunits_1.2.0  
+## [22] htmlwidgets_1.6.2   bit_4.0.5           pkgbuild_1.4.2     
+## [25] xml2_1.3.5          pkgload_1.3.3       miniUI_0.1.1.1     
+## [28] withr_2.5.1         grid_4.3.1          fansi_1.0.5        
+## [31] roxygen2_7.2.3      urlchecker_1.0.1    profvis_0.3.8      
+## [34] xtable_1.8-4        colorspace_2.1-0    data.tree_1.0.0    
+## [37] scales_1.2.1        cli_3.6.1           rmarkdown_2.25     
+## [40] crayon_1.5.2        generics_0.1.3      remotes_2.4.2.1    
+## [43] webchem_1.3.0       rstudioapi_0.15.0   httr_1.4.7         
+## [46] tzdb_0.4.0          sessioninfo_1.2.2   cachem_1.0.8       
+## [49] parallel_4.3.1      rvest_1.0.3         BiocManager_1.30.22
+## [52] vctrs_0.6.4         devtools_2.4.5      webshot_0.5.5      
+## [55] jsonlite_1.8.7      bookdown_0.36       callr_3.7.3        
+## [58] hms_1.1.3           bit64_4.0.5         systemfonts_1.0.5  
+## [61] jquerylib_0.1.4     glue_1.6.2          ps_1.7.5           
+## [64] stringi_1.7.12      gtable_0.3.4        later_1.3.1        
+## [67] EnvStats_2.8.1      munsell_0.5.0       pillar_1.9.0       
+## [70] htmltools_0.5.6.1   R6_2.5.1            Rdpack_2.5         
+## [73] rprojroot_2.0.3     vroom_1.6.4         evaluate_0.22      
+## [76] shiny_1.7.5.1       highr_0.10          rbibutils_2.2.15   
+## [79] memoise_2.0.1       httpuv_1.6.12       bslib_0.5.1        
+## [82] Rcpp_1.0.11         svglite_2.1.2       xfun_0.40          
+## [85] fs_1.6.3            usethis_2.2.2       pkgconfig_2.0.3
+
+
+
+
+
+

References

+
+
+1. LibreOffice - free office suite. https://www.libreoffice.org/. +
+
+
+ + + +
+
+ +
+ + + + + + + + + + + + + + + + + + + diff --git a/vignettes/REcoTox_PDF.R b/vignettes/REcoTox_PDF.R new file mode 100644 index 0000000..756456f --- /dev/null +++ b/vignettes/REcoTox_PDF.R @@ -0,0 +1,197 @@ +## ----biocstyle, echo = FALSE, results = "asis"-------------------------------- +BiocStyle::markdown() + +## ----init, message = FALSE, echo = FALSE, results = "hide"-------------------- +## Silently loading all packages +library(BiocStyle) +library(desc) +library(kableExtra) +library(tidyverse) + +## ----load REcoTox package, eval = FALSE, echo = TRUE, message = FALSE, warning = FALSE---- +# # Load the REcoTox package +# library(REcoTox) + +## ----R Documentation, echo = TRUE, eval = FALSE------------------------------- +# # Documentation of REcoTox +# help(package = "REcoTox") + +## ----initialize folders, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE---- +# Path of the project folder +project_folder <- "REcoTox_demo" + +database_folder <- system.file("extdata/database_folder", package="REcoTox") +# The project folder is created in the home directory +project_path <- normalizePath(ifelse(.Platform$OS.type == "unix", + paste0("~/", project_folder), + paste0( + Sys.getenv("HOMEPATH"), + "\\", + project_folder + ) +)) + +# An existing folder is deleted +#if (dir.exists(project_folder)) { +# unlink(project_folder, recursive = TRUE) +#} + +## ----create project, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE---- +project <- REcoTox::create_project(database_path = database_folder, + project_path, + initalise_database_project = TRUE, # create the basic project from current ASCII files in DB folder + initalise_project = TRUE, # initializes the project folder + load_default = FALSE) # loads the default project in the project folder in the memoryfault_example = TRUE + +file.copy( + from = system.file( + "extdata", + "Query_EcoTox_DB.R", + package = "REcoTox" + ), + to = normalizePath( + path = file.path( + project_folder, + "Query_EcoTox_DB.R" 
+ ), + winslash = "\\", + mustWork = FALSE + ), + overwrite = TRUE + ) + + +## ----list project folder------------------------------------------------------ +# List files and directories in project_folder +list.files(project_folder, recursive = TRUE, include.dirs = TRUE) + +## ----list database folder----------------------------------------------------- +# List files and directories in project_folder +list.files(database_folder, recursive = TRUE, include.dirs = TRUE) + +## ----view chemical_properties, echo = TRUE, eval = TRUE, message = TRUE------- +# Review of the chemical properties +chemical_properties <- readr::read_csv(file = normalizePath(path = file.path( + database_folder, + "chemical_properties.csv" +), ), show_col_types = FALSE) + +kable( + chemical_properties %>% + select(cas_number:dtxsid_ecotox) %>% + head(5), + format = "latex", digits = 2 +) + +## ----view results, echo = TRUE, eval = TRUE, message = TRUE------------------- +# Review of the result table +results <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "results.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + results %>% + select(result_id:sample_size_mean) %>% + head(5), + format = "latex", digits = 2 +) + +## ----view chemicals, echo = TRUE, eval = TRUE, message = TRUE----------------- +# Review of the substance_table +substances <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "validation", + "chemicals.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + substances %>% + select(cas_number:ecotox_group) %>% + head(5), + format = "latex", digits = 2 +) + +## ----view references, echo = TRUE, eval = TRUE, message = TRUE---------------- +# Review of the substance_table +references <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "validation", + "references.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + 
references %>% + select(reference_number:author) %>% + head(5), + format = "latex", digits = 2 +) + +## ----view species, echo = TRUE, eval = TRUE, message = TRUE------------------- +# Review of the substance_table +species <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "validation", + "species.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + species %>% + select(species_number:kingdom) %>% + head(5), + format = "latex", digits = 2 +) + +## ----initialize databases, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE---- +# project <- REcoTox::create_project(database_path = database_folder, +# project_path, +# initalise_database_project = TRUE, +# initalise_project = TRUE, +# load_default = FALSE) +# ) + +## ----initialize project, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE---- +# project <- REcoTox::prepare_data(project = project, +# load_initial_project = FALSE, +# new_project_path = NA, +# save_project = TRUE +# ) + +## ----sessioninfo, echo = TRUE, eval = TRUE, message = FALSE------------------- +sessionInfo() + +## ----clean_up, echo = FALSE, results = "asis", eval = FALSE------------------- +# #unlink(project_folder, recursive = TRUE) + diff --git a/vignettes/REcoTox_PDF.Rmd b/vignettes/REcoTox_PDF.Rmd index 5a6b7da..276a184 100644 --- a/vignettes/REcoTox_PDF.Rmd +++ b/vignettes/REcoTox_PDF.Rmd @@ -1,1462 +1,594 @@ ---- -title: "REcoTox - a workflow to process US EPA ECOTOX Knowledgebase ASCII files (v`r desc::desc_get_version()`)" -shorttitle: "REcoTox (version `r desc::desc_get_version()`)" -author: | - | Tobias Schulze - | Helmholtz Centre for Environmental Research - UFZ, Leipzig, Germany - | tobias.schulze@ufz.de -date: 2023-09-30 -output: - bookdown::pdf_document2: - toc: true - toc_depth: 2 - number_sections: true - latex_engine: "pdflatex" - global_numbering: true -vignette: > - %\VignetteIndexEntry{REcoTox - a workflow to process US EPA ECOTOX Knowledgebase 
ASCII files (PDF version)} - %\VignetteKeywords{E} - %\VignettePackage{REcoTox} - %\VignetteEncoding{UTF-8} - %\VignetteEngine{knitr::rmarkdown} - %\VignetteDepends{BiocStyle, desc} -bibliography: references.bib -csl: biomed-central.csl -editor_options: - markdown: - wrap: 72 ---- - -```{r biocstyle, echo = FALSE, results = "asis"} -BiocStyle::markdown() -``` - -```{r init, message = FALSE, echo = FALSE, results = "hide" } -## Silently loading all packages -library(BiocStyle) -library(desc) -library(kableExtra) -library(tidyverse) -``` - -\newpage - -# Background -The search and extraction of experimental ecotoxicological information is often -a tedious work. A good and comprehensive data source is the -[US EPA ECOTOX Knowledgebase](https://cfpub.epa.gov/ecotox/ "US EPA ECOTOX Knowledgebase"). -It contains more than 1 million data points for almost 13,000 chemicals -and 14,000 single species. However, for a high-throughput hazard assessment, -it is not possible to extract all relevant data of the online database. -The purpose of REcoTox is to extract the relevant information and to aggregate -the data based on the user criteria out of the entire database -[ASCII files](https://gaftp.epa.gov/ecotox/ecotox_ascii_03_10_2022.zip "ECOTOX Knowledgebase ASCII files"). - -# Introduction - -[REcoTox](https://github.com/tsufz/REcoTox) is a semi-automated, interactive -workflow to process [US EPA ECOTOX Knowledgebase](https://cfpub.epa.gov/ecotox/ "US EPA ECOTOX Knowledgebase") -entire database [ASCII files](https://gaftp.epa.gov/ecotox/ecotox_ascii_03_10_2022.zip "ECOTOX Knowledgebase ASCII files") -to extract and process ecotoxicological data relevant (but not restricted) to -the ecotoxicity groups algae, crustaceans, and fish in the aquatic domain. -The focus is aquatic ecotoxicity and the unit of the retrieved data is `mg/L`. - -\newpage - -# Input files and folders - -`REcoTox` requires an unzipped `US EPA Knowlegdebase` database in `ASCII` format (Zitat). 
-The database is preferable expanded in an own database folder to be defined during the processing, -The database consists of relatively referenced text files. The separator of the data is the pipeline `|` symbol. - -In the first session of `REcoTox`, a file `chemical_properties.csv` is created in the database folder. -This files contains chemical identifiers and chemical properties required for the processing of the -chemical data in the knowlegdebase and to tag the results. - -The chemical property file is dynamically updated and requires also some manual curation. It will grow as soon -new chemicals are added to the knowledgebase. - -The `project_folder` contains the `R` script for processing as well as the intermediate and final processing files. -The naming of the folder is arbitrary, but do not use spaces, but underscores (`_`) or hyphens (`-`) for separating -parts. - -To run the queries, a predefined processing script is available on `GitHub` ([`Query_EcoTox_DB.R`](https://github.com/tsufz/REcoTox/blob/main/inst/extdata/Query_Ecotox_DB.R)) or in the local `REcoTox` package folder. - - -\newpage - -# Using REcoTox - -The following tutorial explains the different steps of `REcoTox` in a -comprehensive demonstration. `REcoTox` includes different interactive -steps, which require the evaluation of comma separated text files -(`*.csv`) in an external spreadsheet application (preferable LibreOffice -[@LibreOffice]). - -## Load the REcoTox package - -```{r load REcoTox package, eval = TRUE, echo = TRUE, message = FALSE, warning = FALSE} -# Load the REcoTox package -library(REcotox) -``` - -## Documentation for MZquant - -A detailed description of all functions of `REcoTox` functions is available in the -`R Documentation`. 
- -```{r R Documentation, echo = TRUE, eval = FALSE} -# Documentation of REcoTox -help(package = "REcoTox") -``` - -## Preparation of the working environment (for beginners) - -The processing in `REcoTox` is interactivally controlled by a processing -script `Query_EcoTox_DB.R`. - -If you run `REcoTox` for the first time, a tutorial project is available -to demonstrate all important steps of `REcoTox` processing. The following -script is preparing an example folder in your home directory and copies -all necessary files in the folder. - -```{r initialize folders, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE} -# Path of the project folder -project_folder <- "REcoTox_demo" -database_folder <- system.file("extdata", package="REcoTox") - -# The project folder is created in the home directory -project_folder <- normalizePath(ifelse(.Platform$OS.type == "unix", - paste0("~/", project_folder), - paste0( - Sys.getenv("HOMEPATH"), - "\\", - project_folder - ) -)) - -# An existing folder is deleted -if (dir.exists(project_folder)) { - unlink(project_folder, recursive = TRUE) -} -``` - -This command initializes the project folder and the database folder. It copies also the processing script to the project folder. 
- -```{r initialize project, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE} -project <- create_project(database_path, - project_path, - initalise_database_project = TRUE, # create the basic project from current ASCII files in DB folder - initalise_project = TRUE, # initializes the project folder - load_default = FALSE) # loads the default project in the project folder in the memoryfault_example = TRUE - -file.copy( - from = system.file( - "extdata", - "Query_EcoTox_DB.R", - package = "REcoTox" - ), - to = normalizePath( - path = file.path( - project_folder, - "Query_EcoTox_DB.R" - ), - winslash = "\\", - mustWork = FALSE - ), - overwrite = TRUE - ) - -``` - -The `project_folder` contains the following files: - -```{r list project folder} -# List files and directories in project_folder -list.files(project_folder, recursive = TRUE, include.dirs = TRUE) -``` - -The `database_folder` contains the following files and folders: -`chemical_properties.csv` is the file containing the curated chemical properties, -`results.txt` contains the testing results collected in the knowledgebase, and -`test.txt` contains the the metadate of the tests. - -The folder `validation` contains the files `chemicals.txt` with chemical information, -the file `references.txt` contains the references and `species.txt` the species. - -```{r list project folder} -# List files and directories in project_folder -list.files(database_folder, recursive = TRUE, include.dirs = TRUE) -``` - -It contains only the `Query_EcoTox_DB.R` file. 
- -## Review of the input data - -To review the input data, let us look in the data: - -```{r view chemical_properties, echo = TRUE, eval = TRUE, message = TRUE} -# Review of the chemical properties -chemical_properties <- readr::read_csv(file = normalizePath(path = file.path( - database_folder, - "chemical_properties.csv" -), ), show_col_types = FALSE) - -kable( - samples %>% - head(5), - format = "latex", digits = 2 -) -``` - -```{r view results, echo = TRUE, eval = TRUE, message = TRUE} -# Review of the substance_table -substances <- - readr::read_delim( - file = normalizePath( - path = file.path( - project_folder, - "results.txt" - ), - ), - show_col_types = FALSE, - delim = "|" - - ) - -kable( - substances %>% - head(5), - format = "latex", digits = 2 -) -``` - -```{r view tests, echo = TRUE, eval = TRUE, message = TRUE} -# Review of the substance_table -substances <- - readr::read_delim( - file = normalizePath( - path = file.path( - project_folder, - "results.txt" - ), - ), - show_col_types = FALSE, - delim = "|" - - ) - -kable( - substances %>% - head(5), - format = "latex", digits = 2 -) -``` - - -## Preparation of the working environment (for experienced users) - -Experienced users can re-use existing `MZquant_processing_script.R` and -`MZquant_settings.yaml` files.^[Note that the workflow and the settings -file can change in a new package version and thus a review of the change -log (NEWS) is recommended.] - -The steps to setup a custom project are: - -1. Create a project folder.^[It is recommended to use the same name as the - analytical batch.] - -2. Copy the custom `MZquant_substances.csv` and the - `MZquant_samples.csv` in the folder. - -3. Re-use existing `MZquant_processing_script.R` and/or - `MZquant_settings.yaml` files. - -4. Or, create new files with the following command. 
- -```{r initialize new processing files, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE} - -# Initialization and setup of the project folder -project_folder <- "your_folder" -settings_file <- "MZquant_settings.yaml" - -# default_settings = TRUE loads the default settings file -# default_processing_script = TRUE loads the default processing script -initialise_project(project_folder, settings_file, - default_processing_script = TRUE, - default_settings = TRUE, - default_example = FALSE -) -``` - -The command will initialize the project folder and copy the default -`MZquant_settings.yaml` and the default `MZquant_processing_script.R` in -your project folder. If you copied existing files in step 3 with similar -names, they will be overwritten by step 4. - -## MZquant processing and settings - -Open the `MZqant_processing_script.R` and the settings file -`MZquant_settings.yaml` for review and further processing. - -```{r load processing and settings file, echo = TRUE, eval = FALSE} - -# Open the processing script -file.edit(file.path(project_folder, "MZquant_processing_script.R")) - -# Open the settings file -file.edit(file.path(project_folder, settings_file)) -``` - -### The processing script - -The `MZquant_processing_script.R` contains all necessary steps of the -MZquant workflow described in this vignette. - -The data of the `MZquant` project is stored in a hidden environment -`.MZquant.env`. - -To review the hidden environment, you may call. - -```{r call enviroment, eval = FALSE, echo = TRUE} -View(.MZquant.env) -``` - -The `.MZquant.env` is stored after each processing step to ensure easy -redo of single processing steps without need to repeat the whole -analysis. - -### The settings file - -The settings file `MZquant_settings.yaml` contains all necessary and -customization settings. Scroll thought the document and make all -required edits, for example: - -- How are the blanks tagged (e.g., `Blank`)? -- How are the calibration columns tagged? 
-- How are the quality control columns tagged? -- Should a blank correction be performed? -- Which type of blank correction is performed? - -Finally, save the file and go back to the processing script -`MZquant_processing_script.R`. - -The settings file is read in the current `.MZquant.env` environment by -running: - -```{r read in settings, eval = FALSE, echo = TRUE} -# Call the read_settings function to read the filed settings -# into the current `.MZquant.env` -read_settings(settings_file, project_folder) -``` - -The `read_settings()` function is especially helpful if settings are -changed during analysis or old projects are reprocessed with a newer -version of `MZquant`. In the latter case, new features or bug fixes can -be easily applied without need of the tedious reprocessing of the entire -data. - -## Feature list workflow - -The `feature_list_workflow` reads the `MZquant_samples.csv` and the -`MZquant_substances.csv`. It prepares the feature tables for the -analysis in MZquant. - -### Feature list workflow step 1 - -Step 1: Read `MZmine 3` exported feature list `MZquant_samples.csv` and -the substance file `MZquant_substances.csv`. - -```{r} -#| feature list workflow step 1, echo = TRUE, -#| message = FALSE, warning = FALSE, eval = TRUE - -# Feature list workflow step 1 -feature_list_workflow(step = 1) -``` - -- Read the `MZmine 3` exported feature list and the `substance_table`. - -- Merge the tables. - -- Assign standard names (`compound_db_identity:stdname`) to the - annotations in the `feature list` and the `substance_table`. - -- Assign the response method set in - `MZquant_settings.yaml:data:response`. - -- Select mandatory and custom fields set in - `MZquant_settings.yaml:data:meta_mzmine`. - -- Export the new table to `MZquant_samples_refined.csv` in the - `project_folder`. 
- -The `feature_liste_workflow` step 1 applies a `regular expression` to -the `compound_identity` in the `substance_table`, respectively, to the -column `compound_db_identity:compound_db_identity` in the -`MZquant_samples.csv` to replace any punctuation in the names to ensure -unequivocal standardized names called `stdname`. The `stdname` is one of -the `keys` used in `MZquant`. - -The regular expression is: - -$$stdname = stringi::stri\_replace\_all\_regex(name, "[[:punct:] \backslash \backslash s]+", "\_")$$ - -For example, `3,3'-Dichlorobenzidine` is expressed as -`3_3_Dichlorobenzidine`, which is unequivocally processing. - -In the `samples_refined.csv`, the `stdname` is stored in -`compound_db_identity:compound_db_identity:stdname.` - -Review this table. For experienced users: Take the opportunity to edit -the table to your purposes, for example, add missing QC data by copying -of calibration data. - -*Optional: Load the environment after running -`quantification_workflow(step = 1)`* - -The working environment is stored after each processing step in the root -of the project folder. To reload the environment, run the `load` -command. This is for example helpfully, if a processing required to -break for a while or a `bug` in a later step was tracked down. - -```{r load feature_list_workflow_1.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `feature_list_workflow(step = 1) -load( - file = file.path(project_folder, "MZquant_feature_list_workflow_1.RData"), - envir = .GlobalEnv -) -``` - -### Feature list workflow step 2 - -Step 2: Read and preprocess the refined feature list -`MZquant_samples_refined.csv`. - -```{r feature list workflow step 2, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE} -# Feature list workflow step 2 -feature_list_workflow(step = 2) -``` - -- Read the MZmine export.csv file and the `substance_table`. - -- Prepare the internal feature list structure. 
- -- Match the annotations in the feature list with the - `substance_table`. - -- Filter missing substances. - -The columns in `MZquant_settings.yaml:data:meta_mzmine` and -`MZquant_settings.yaml:data:meta_substances` are appended. The -`feature_list_workflow` step 2 stores three files in the -`results/feature_lists` folder. - -```{r list feature_list folder, echo = TRUE} -# List files and directories in the feature list folder -list.files(file.path(project_folder, "results", "feature_lists"), - include.dirs = TRUE -) -``` - -The `target_table_for_review.csv` is the relevant spreadsheet for all -edits in the annotated feature list. Open the list in your favorite -spreadsheet application. To preserve the edits for later review, the -file could be saved in the native spreadsheet format (e.g. `ods` or -`xlsx`). - -The only columns to be edited (do not edit any other column or delete -any row or entry, this will have side effects and you need to repeat -this step): - -- `remove_annotation` -\> set to `1` for removing annotations - -- `newstdname` -\> add a new `stdname` (from - `substance_table_for_review.csv`), if necessary (e.g., in case of - *duplicate masses*) - -- `new_class` -\> class of the annotation (for example of an internal - standard), `c(Target, Suspect, Internal Standard)` - -See the green highlighted columns in [Figure 3](#fig3). - -#### Tips for the data review - -**Hide columns, use auto filters and freeze rows/columns ([Figure -3](#fig3))** - -- Hide all columns you do not need for review (e.g., `stdname`, - `compound_id`, `mzquant_id`). - -- Mark the first row and add the auto filter (data \| AutoFilter). - -- Use the Freeze Rows and Columns functions to freeze the header and - the column with the `StdName`. - -- Columns could be re-arranged, but never delete columns or remove - rows! - -**Hide columns, use auto filters and freeze rows/columns** - -- Use color scales to mark the the blank data the sample data ([Figure - 4](#fig4)). 
- -- Use the scientific notation for theses columns for straightforward - comparison. - -- Add `duplicate` conditional formatting to column `stdname` to - highlight duplicate names ([Figure 5](#fig5)) - - - -```{r} -#| fig3, fig.cap = "Hide and freeze columns.", -#| fig.align = "center", echo = FALSE, out.width = "80%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_3.png") -``` - - - -```{r} -#| fig4, fig.cap = "Highlight the data with color scales.", -#| fig.align = "center", echo = FALSE, out.width = "100%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_4.png") -``` - - - -```{r} -#| fig5, -#| fig.cap = "Use conditional formatting to annotate duplicates in stdname and remove\\_annotation.", -#| fig.align = "center", -#| echo = FALSE, out.width = "75%", eval = TRUE -knitr::include_graphics("./figures/Figure_5.png") -``` - -**Data review** - -The selection of the correct annotation is not always straightforward. -`MZquant` delivers different qualifiers to support the decision ([Figure -6](#fig6)). - -- Decision criteria, review the following columns: - - - expected `mz` from `MZquant_substances.csv` - - - expected `rt` from `MZquant_substances.csv` - - - detected `row_mz` in MZmine 3 - - - detected `row_rt` in MZmine 3 - - - `annotation_score` estimated in MZmine 3 - - - `row_mz_delta_ppm` estimated in MZmine 3 - - - `deltart` as the difference between expected and detected - retention time - - - `deltamz` as the difference between expected and detected mz - - - `mz_ppm_error` in ppm estimated in MZquant - - - \`quantification_monotonicity\`\` to show monotonicity of the - quantification levels - - - `counts` in blanks, calibrations, samples, and quality controls - - - `blank_features_threshold`^[The `blank_features_threshold` is - the value which is calculated by the blank correction and is - the level of blank noise. All features with lower intensities - than the threshold will be removed in the blank_workflow.] 
- - - maximum value in the samples - - - -```{r} -#| fig6, fig.cap = "Criteria for the removal of duplicate annotations.", -#| fig.align = "center", echo = FALSE, out.width = "90%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_6.png") -``` - -- Lookup for compounds with low findings in the samples, e.g.: - - - select for example those records with low sample annotations - - - lookup for the better annotation score - - - check if the compound occurs only in the first consecutive - sample following the highest calibration level - - - check for their intensity, if they occur only at the noise level - (1e4. 1e3, compare to blanks), they could be carry over from the - calibration sample - - - check if they only occur in QC samples and so on - - - remove the annotation if the compound is carry over, not - occurring in samples etc. by setting the value in - `remove_annotation` to `1` - - - **Note**: because of the blank correction, some compound without - values in the samples will occur in the trimming table - -- Lookup for duplicates - - - Duplicates should be removed from the table; they occur from - broad peaks, isobaric compounds, retention time shifts, etc. - - - Check the comment field and lookup for existing entries, for - example ([Figure 5](#fig5)): - - - duplicate mass -\> isobaric compounds - - - check the `missed_substances_for_review.csv` and search - for the `unit mass` - - - compare the retention time - - - copy the `StdName` from the missed table to the field - `newStdName` to rename the tagging of the compound - -In addition a screening of the shapes of the peaks in `MZmine 3` is also -helpful, does keep `MZmine 3` always open until the end of your analysis -([Figure 7](#fig7)). However, the decision is also not always -straightforward, if shapes fit well in all cases ([Figure 8](#fig8)). 
- -It is not always an easy decision, the `deltart` / `deltamz` and the -`mz_ppm_error` might be good for some records, but the ratios -`QC`/`calibration` does not fit well → in cases of concern ask your -supervisors or discuss with colleagues. - -Mark all duplicates to be removed by setting the value in -`remove_annotation` to `1` ([Figure 5](#fig5)) or re-annotate them by -setting a new name in `new_StdName`.Lookup for other curious compounds -and remove them if you like. - -**Note: If you change to scientific notation, change back to `General` -(`LibreOffice`) or `Standard` (`Excel`) formats to avoid loss of -precision!** In recent versions of `Excel`, it is possible to save data -to `CSV UTF-8 (Comma delimited) (*.csv)`. This is recommended. - - - -```{r} -#| fig7, -#| fig.cap = "Peak shapes of duplicate annotations in MZmine 3. An example with straightforward decision.", -#| fig.align = "center", echo = FALSE, out.width = "100%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_7.png") -``` - - - -```{r} -#| fig8, -#| fig.cap = "Peak shapes of duplicate annotations in MZmine 3. An example, that needs more decision criteria.", -#| fig.align = "center", echo = FALSE, out.width = "90%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_8.png") -``` - -*Optional: Load the environment after running -`quantification_workflow(step = 2)`* - -```{r load feature_list_workflow_2.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `feature_list_workflow(step = 2) -load( - file = file.path(project_folder, "MZquant_feature_list_workflow_2.RData"), - envir = .GlobalEnv -) -``` - -To override the revision step in the table, an edited table can be -loaded in the environment. 
- -```{r read the demonstration target table, echo = TRUE, eval = TRUE, message = FALSE} -# Load the edited `target_table_for_review.csv` -copy_demo_target_table(project_folder = project_folder) -``` - -### Feature list workflow step 3 - -After successful edit of `target_table_for_review.csv`, the table is -loaded and processed in `feature_list_workflow` step 3. This step -updates the `feature list` and prepares the final `feature list` for -further processing. - -```{r feature list workflow step 3, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE} -# feature list workflow step 3 -feature_list_workflow(step = 3) -``` - -In the case, duplicates are still not annotated, `MZquant` will warn -([Figure 9](#fig9)). In this case, review the `target_table_for_review.csv` -and repeat `feature_list_workflow(step = 2)`.^[This feature can be also used -to quickly check for duplicates.] - - - -```{r} -#| fig9, fig.cap = "Remaining duplicate mass compounds.", -#| fig.align = "center", echo = FALSE, out.width = "75%", -#| eval = TRUE -knitr::include_graphics("./figures/Figure_9.png") -``` - -*Optional: Load the environment after running -`feature_list_workflow(step = 3)`* - -```{r load feature_list_workflow_3.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `feature_list_workflow(step = 3) -load( - file = file.path(project_folder, "MZquant_feature_list_workflow_3.RData"), - envir = .GlobalEnv -) -``` - -The `feature list workflow` is finalized. The workflow generates several -files in the *feature_list* folder for interest users. - -```{r list feature_list folder 2, echo = TRUE, eval = TRUE} -# List files and directories in the feature_list folder -list.files(file.path(project_folder, "results", "feature_lists"), - include.dirs = TRUE -) -``` - -## Blank workflow - -The blank workflow runs the `blank tagging`, if required. If the blank -correction is not necessary, set blank_correction: FALSE in -MZquant_settings.yaml. 
For details see also the linked functions. - -```{r blank workflow, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE} -# Blank workflow -blank_workflow() -``` - -`MZquant` integrates two different approaches for blank correction. In -general, a `blank threshold` is calculated of the `blank` values and all -features below this `blank threshold` are deleted of the `feature_list`. - -The first option calculates a simple `blank threshold` based on the -`mean` and the `standard deviation`: - -$$blankthreshold = mean + blankfactor * sd$$ Where `blankfactor` is a -`numerical` factor to control the fold-change of the -`standard deviation` `sd`. - -The second function calculates the `Student t` distribution based -`blank threshold` for each feature based on the -`method detection limit (MDL)` estimation method of US EPA -(@usepa49FR434302011). The method uses a distribution controlled factor -for the addition of the `standard deviation` (`sd`) to the `mean` value: - -Case 1: `n >= minimum number of valid blank values` - -$$blank threshold = mean + qt(p, df = n - 1) * sd$$ - -Where `mean` is the `average` of the blank feature values, `qt` is the -`Students t` density function, `p` is the probability, `df` are the -`degrees of freedom`, `n` is the number of blank values and `sd` is the -`standard deviation`. - -Case 2: `n < minimum number of valid blank values` - -$$blankthreshold = mean + qt(p, df = 1) * sd$$ - -The parameters `p` (`p` = `alpha`) and `n` (`n` = `blank_qt_threshold`) -are set in the *settings file*. 
- -*Optional: Load the environment after running \`blank_workflow()* - -```{r load blank_workflow.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `blank_workflow()` -load( - file = file.path(project_folder, "MZquant_blank_workflow.RData"), - envir = .GlobalEnv -) -``` - -## Quantification workflow - -The quantification workflow creates the quantification models, enables -trimming of the quantification models and finally performs the -quantification of the targeted compounds. - -### Quantification workflow step 1 - -The `quantification_workflow` step 1 creates the *calibration file -table* `calibration_levels.csv` in ./`results/quantification` for review -and edition of the `calibration levels` and column mapping. The workflow -links the columns with the calibration data with the related -concentration levels. - -```{r quantification workflow 1, echo = TRUE, eval = TRUE, message = FALSE} -# quantification workflow step 1 -quantification_workflow(steps = 1) -``` - -The tag for the calibration columns is obtained of -`MZquant_settings.yaml` from section `data:standard`. If the tag -`data:standard_level_pattern` has the correct syntax, the calibration -levels are filled automatically. - -**For example:** - -**Calibration column name** - -`220101_17_ESIpos_Calib_std_500_ngL` - -**Standard level pattern** - -`data:standard_level_pattern: "Calib_std"` - -The calibration level must be separated by the following patterns by an -underline (`_`). - -**NOTE** - -`MZquant` cannot handle duplicate level annotations in the current -version. In case of several calibration files for one calibration level, -the levels need to be corrected and distinguished by adding a small -decimal to the concentration level, for example `0.001, 0.002, etc.`. 
- - - - - - - - - - - - - - - - - - - - - -| filename | concentration | -|:------------------------------------|--------------------------:| -| 220101_12_ESIpos_Calib_std_10_ngL | 10 | -| 220101_15_ESIpos_Calib_std_100_ngL | 100 | -| 220101_18_ESIpos_Calib_std_1000_ngL | 1000 | -| 220101_xx_ESIpos_Calib_std_10_ngL | \textcolor{red}{10.001} | -| 220101_xx_ESIpos_Calib_std_100_ngL | \textcolor{red}{100.001} | -| 220101_xx_ESIpos_Calib_std_1000_ngL | \textcolor{red}{1000.001} | - -*Optional: Load the environment after running -`quantification_workflow(step = 1)`* - -```{r load MZquant_quantification_workflow_1.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `quantification_workflow(step = 1)` -load( - file = file.path(project_folder, "MZquant_quantification_workflow_1.RData"), - envir = .GlobalEnv -) -``` - -### Quantification workflow step 2 - -To load the (edited) `calibration_levels.csv` in the environment run the -`quantification_workflow` step 2. In this step, the -`internal standards (IS)` are assigned to each substance feature and the -relative feature heights (or areas) are calculated by normalization to -the related `IS`. The workflow assigns either the nearest IS (mode -`auto` in the *substance file* column `is_used`) or the predefined IS in -`is_used`. - -```{r quantification workflow 2, echo = TRUE, eval = TRUE, message = FALSE} -# quantification workflow -quantification_workflow(steps = 2) -``` - -The assigned internal standards and the IS calibrated data can be -reviewed in the following files. 
- -```{r list quantification folder 1, echo = TRUE, eval = TRUE} -# List files and directories in the quantification folder -list.files( - file.path(project_folder, "results", "quantification"), - pattern = "_IS_", include.dirs = TRUE -) -``` - -*Optional: Load the environment after running -\`quantification_workflow(step = 2)* - -```{r load MZquant_quantification_workflow_2.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `quantification_workflow(steps = 2)` -load( - file = file.path(project_folder, "MZquant_quantification_workflow_2.RData"), - envir = .GlobalEnv -) -``` - -*Optional: Review assigned internal standards, reassign internal -standards* - -If you are unhappy with the assignments, open the -`substance_Table_IS_assignment_for_review.csv` and add the standardized -name of the internal standard you like to use in the column `IS_used` -and save the file. - -Then run the `quantification_workflows(step = 2)` again to apply the -changes to your dataset. - -```{r quantification workflow 2 redo, echo = TRUE, eval = FALSE, message = FALSE} -# quantification workflow -quantification_workflow(steps = 2, redo = TRUE) -``` - -*Optional: Load the environment after running the IS reassignment* - -```{r load MZquant_quantification_workflow_redo_2.RData, echo = TRUE, eval = FALSE, message = FALSE} -# Loads the environment after running `quantification_workflow(steps = 2, redo = TRUE)` -load( - file = file.path( - project_folder, - "MZquant_quantification_workflow_redo_2.RData" - ), - envir = .GlobalEnv -) -``` - -### Quantification workflow steps 3-5 - -The `quantification_workflow` steps 3-5 generates the calibration -models, trimms the models, plots the models, and finally quantifies the -features: - -- step 3: generates generalized additive models (GAM) based on the raw - data, exports the feature table and plots the draft models. 
The following three -files are required for the manual trimming (in folder -`results/quantification`). - -```{r list peaklist folder 4, echo = TRUE, eval = TRUE} -# List files and directories in the peaklist folder -list.files(file.path(project_folder, "results", "quantification"), - pattern = "trimmed", include.dirs = TRUE -) -``` - -The spreadsheet `quantification_table_trimmed_for_review.csv` is the -main table for reviewing and reprocessing the trimming of the -calibration models. The file -`quantification_table_non-trimmed_for_review.csv` contains the -non-trimmed concentration to relative intensity data. This table is very -useful to restore data, if the original relative intensities have been -manually deleted in the next steps. The pdf-file -`quantification_table_trimmed_model_figs.pdf` includes figs of all -calibration models. - -#### Editing the trimming table - -- Open the `quantification_table_trimmed_model_figs.pdf`. - -- Open the `quantification_table_trimmed_for_review.csv` in your - favorite spreadsheet application. - -- Set the `autofilter` to row 1 ([Figure 10](#fig10)). - -- Insert an empty row above row 1. - -- Hide the first column (`mzquant_id`). - -- Freeze the first column (`stdname`) and the first two rows. - -- Number the calibration columns ("C\_"-columns) beginning from the - columns with the lowest concentration level up to the highest - concentration level. The starting number is generally 3 representing - the position of the lowest concentration level in column C. - -- Select the calibration level columns and the consecutive columns - `in_sample_min` and `in_sample_max`, add a color gradient and set - the format to scientific notation for easier review. 
-
-```{r}
-#| fig10, fig.cap = "Prepare trimming table for review.",
-#| fig.align = "center", echo = FALSE, out.width = "100%",
-#| eval = TRUE
-knitr::include_graphics("./figures/Figure_10.png")
-```
-
-It is also recommended to highlight the columns `manual_trim`^[This parameter
-controls whether the compound is automatically trimmed (`0`) or not (`1`).
-Set it to (`0`) after manual trimming to apply the manual trimming. If set
-to `1`, no trimming is applied at all.] and `tracefinder`^[This parameter is
-just a reminder to evaluate this parameter in a commercial software such as
-TraceFinder.] with `1 = red` and `0 = green` as well as the good (bad)
-`gam_r2` ([Figure 11](#fig11)).
-
-```{r}
-#| fig11, fig.cap = "Highlight manual\\_trim, tracefinder, and gam\\_r2.",
-#| fig.align = "center", echo = FALSE, out.width = "100%",
-#| eval = TRUE
-knitr::include_graphics("./figures/Figure_11.png")
-```
-
-Check now all substances with `0` findings in the samples and set the
-manual_trim and the drop_compound columns to `1` ([Figure 12](#fig12)).
-The `0` findings could be selected by `NA` (and 0?) in column
-`in_sample_num`. In general, the trim and drop columns should be filled
-with `1` automatically.
-
-```{r}
-#| fig12, fig.cap = "Select manual\\_trim and tracefinder columns.",
-#| fig.align = "center", echo = FALSE, out.width = "90%",
-#| eval = TRUE
-knitr::include_graphics("./figures/Figure_12.png")
-```
-
-In the next step, select those rows with only 1 finding in the samples
-and look up the column `min_sample_name` on the right end of the data
-block. Select the first sample after the highest calibration level. This
-sample often contains carry-over peaks at a very low level. Compare the
-in sample minimal level with the lowest calibration levels and remove
-those samples, if the sample level is among the low calibration levels
-by setting `manual_trim` to `1` to skip trimming for this compound.
- -The easy part of the trimming part is done, the more tedious starts now. - -*Trimming and rules* - -The trimming of the calibration is controlled by the settings in -`min_sample_neighbor_pos` and `max_sample_neighbor_pos` represent the -lower and upper margin of the trimmed calibration curve. Of course, the -limit cannot be below the lowest level and vice versa with the highest -level. - -In the best case, the calibration curve is fitted to the next -calibration level below the lowest relative intensity in the samples or -vice verse at the sample maximum. - -This automated trimming fails in the following cases (not exhaustive). - -Triggered by a rule: - -1. The trimmed range contains an empty field^[Missing values are handled - as `NA` internally in `MZquant`.] or zero value. - -2. The calibration data contains local minima and/or maxima. - -3. The calibration data is not monotonic increasing. - -4. The minimum sample intensity is below the lower calibration limit. - -5. The maximum sample intensity is above the upper calibration limit. - ---\> If you find bugs or cases, report them to Tobias Schulze -([tobias.schulze\@ufz.de](mailto:tobias.schulze@ufz.de){.email}), -please. - -*Manual trimming* - -In the mentioned cases (and maybe others), manual trimming of the -calibration data is required. - -Select all samples again by removing the sample name filter and the -sample number filter. In the `in_sample_num` filter, deselect the `NA` / -`0` rows to hide those rows. In addition deselect those rows which are -`1` in drop for a better overview. - -Lookup for the columns `min_sample_neighbor_pos`, -`max_sample_neighbor_pos`, `in_sample_min`, `in_sample_max`, -`calibration_monocity_trim`, `gam_r2`, `localmin_pos` and `localmax_pos` -and decide which rule is valid for the trimming. Check always the -minimal number of calibration points within your calibration margins. 
-
-For manual trimming you must set the lower calibration position or
-the upper calibration position according to the sample margins ([Figure
-13](#fig13)). Avoid deleting values outside the margins, as this is a
-possible cause of errors. They will be deleted in the next processing
-step automatically.
-
-```{r}
-#| fig13,
-#| fig.cap = "Example of the trimmed range of the calibration levels.",
-#| fig.width = 2, fig.align = "center", echo = FALSE, out.width = "90%",
-#| eval = TRUE
-knitr::include_graphics("./figures/Figure_13.png")
-```
-
-Calibration levels within the upper and lower limit of the trimmed
-calibration (`min_sample_neighbor_pos`, `max_sample_neighbor_pos`)
-could be removed by deleting the value (for example local minima or
-local maxima).
-
-If it is possible to trim the dataset manually, set `manual_trim` to `0`
-to apply your settings. If not, you may consider evaluating in
-`TraceFinder` (or another application). Then set the `tracefinder`
-column to `1`. You could also add a comment. The `tracefinder` and
-`quantification_comment` columns are exported in the final output file.
-
-If something went wrong, open the
-`MZquant_samples_non-trimmed_for_review.csv` and just copy the
-respective `data block` back to the trimmed data file.
-
-It is recommended to save the file as an `ods` or `xlsx` file to keep
-the formatting, and to export this spreadsheet as
-`MZquant_samples_trimmed_for_review.csv`.
-
-**Important**: Depending on the settings in the spreadsheet application,
-the export of the scientific notation might cause loss of decimals and
-thus re-format the columns to decimals. In addition, delete the first
-column to remove the auxiliary row with the column indexes ([Figure
-10](#fig10)) before exporting to the csv file.
-
-*An example in the demonstration data*
-
-- Look up the compound `0003_Phenazone`.
- -- Set `min_sample_neighbor_pos` = `5` - -- Set `max_sample_neighbor_pos` = `8` - -- Save the `quantification_table_trimmed_for_review.csv` as described - above. - -To override the trimming in demonstration, an edited table can be loaded -in the environment. - -```{r read the demonstration trimmed table, echo = TRUE, eval = TRUE, message = FALSE} -# Load the edited `target_table_for_review.csv` -copy_demo_trimmed_table(project_folder = project_folder) -``` - -Finally, run the `quantification workflow step 5` to apply the manual -trimming and to quantify. - -```{r quantification workflow 5, echo = TRUE, eval = TRUE, message = FALSE} -# quantification workflow -quantification_workflow(steps = 5) -``` - -The last step can repeated, if you are not happy with the result to try -other manual trimming settings. - -\newpage - -# Appendix - -## Description of the `MZquant_settings.yaml` - -The current format was implemented in `MZquant` version 0.8.1. New -settings or changes will be tagged. - -### Current settings (0.8.1) - -The `MZquant_settings.yaml` is separated in six sections: - -- `settings`: `MZquant_settings.yaml` related parameters - -- `project`: project related parameters - -- `data`: data related parameters - -- `processing`: processing related parameters - -- `quantification`: quantification related parameters - -- `results`: settings to configure final output file with results - -### Section `settings` - -- `version`: The version of the default settings file (do not edit). - The `MZquant` version implementing the current - `MZquant_settings.yaml` format - -### Section `project` - -This section includes the general project settings. - -- `data_file`: Name of the data file containing the `MZmine 3` output. - -- `substances_file`: Name of the file containing the - `substance_table`. - -- `seed`: Integer as seed for random functions. 
- -- `save_all`: c(`FALSE`, `TRUE`), if `TRUE`, all generated tables are - exported as `csv` for review or debug (default: `FALSE`). - -### Section `data` - -This section contains the parameters describing the tagging of the -fields in the `aligned feature list`, exported by `MZmine 3`. A correct -parametrization is a prerequisite to assign the `samples`, `blanks`, -`quantifications`, and `quality controls` to the correct group and to -avoid biased results. - -- `standard`: Enter the term, which identifies the calibrations - columns (e.g. `Calib`). - -- `standard_level_pattern`: Enter the pattern before the - quantification level, required for automated assignment of - quantification levels (e.g. the correct pattern of - `Calib_water_2018_1000` is `Calib_water_2018`). - -- `quality_control`: Enter the term, defining the `quality control` - samples (e.g. `QC`). - -- `blank`: Enter the term, which identifies `blanks` (e.g. `Blank`). - -- `calibration_blank`: Enter the term, expressing the specific - `calibration blank`, which may contain `internal standards` biasing - the `blank threshold` estimation (e.g. `Calib_std_Blank`). - -- `response`: Enter the response estimation method (c("`area`", - "`intensity`"), default: `area`. - -- `IS_prefix`: Enter the prefix of the internal standard (e.g. - "`IS"`). - -- `meta_mzmine`: Metadata fields to be included from `MZmine 3` export - (default). An updated file will be written to the results folder. - Including the selected fields and feature information (area or - height). 
The values must be quoted, delimeted by comma, and - emphasized in square brackets, e.g.: ["id", "mz", "mz_range:min", - "mz_range:max", "rt", "rt_range:min", "rt_range:max", - compound_db_identity:compound_db_identity", - compound_db_identity:compound_annotation_score", - "compound_db_identity:mol_formula", - "compound_db_identity:precursor_mz", - "compound_db_identity:mz_diff_ppm", "compound_db_identity:rt", - "manual_annotation:identity", "manual_annotation:comment", - "manual_annotation:compound_name"]. - -- `meta_substances`: Metadata fields to be included from substance - file (default).. An updated file will be written to the results - folder. Including the selected fields and feature information (area - or height). The values must be quoted, delimited by comma, and - emphasized in square brackets, e.g.["compound", - "mzquant:compound_class", "comment", "adduct", "mzquant:mode", - "mzquant:prim_sec"]. - -### Section `processing` - -This section defines the automated `feature_list_workflow` and -`blank_workflow` processing. - -- `meta_substances_to_feature_list`: c(`TRUE`, `FALSE`), if `TRUE`, - the meta substances fields are added to revised MZmine 3 table for - review (default: `TRUE`). - -- `feature_class`: Enter the class which should be processed in blank - correction c("`tagged`","`all`"), be careful, `all` will last longer - time due to unprofessional implementation so far. - -- `blank_correction`: c(`TRUE`, `FALSE`), if `FALSE`, the features are - only tagged with blank thresholds, but not corrected (default: - `TRUE`) - -- `blank_correction_class`: Enter c("`samples`", "`all`") to select - the case of `blank_tagging`: - - - case 1: `samples` - only the features above the - `blank_threshold` are eliminated in samples. - - - case 2: `all` - features above the `blank_threshold` are - eliminated in calibrations and samples. 
- -- `blank_method`: Method for the blank threshold estimation with - c("`default`", "`qt`"): - - - `default`: `mean`(blanks) + `factor` \* `SD`(blanks) - - - `qt`: `mean`(blanks) + `qt`(`probability`, `degrees of freedom`) - \* SD(blanks)) - -- `blank_qt_alpha`: alpha or probability p for the qt estimation - (default: `0.99`). - -- `blank_qt_threshold`: Minimum number of valid blank features for the - use the `qt` estimation (default: `3`). - -- `blank_factor`: Multiplier of the `SD` for the default blank - correction method (default: `2`). - -### Section `quantification` - -This section contains the parameters for the `quantification_workflow`. - -- `min_cal_points`: Enter the minimum of calibration points required - (default: `4`). - -- `in_cal_neighbors`: Enter the minimum of calibration points of the - minimum sample on the lower end (default: `0`). - -- `max_cal_neighbors`: Enter the minimum of calibration points of the - maximum sample on the upper end (default: `0`) - -- `low_fig_scale_factor`: Enter the scale factor for fitting the low - level calibration range. It allows a better visual inspection of the - low calibration range fits, typical values are `0.1` or `0.05` - (default: `0.05`). - -- `IS_impute_method`: Enter the method for the `IS gap filling`. Use - with caution, it is more an experimental method and should be - mentioned, if used in a productive environment. The default method - `KNN` imputes missing internal standards by k-nearest neighbor - imputation. Other methods are considered as not reliable and biased. - -- `colmax`: Minimum of non missing data in columns passed to the - `impute.KNN::impute` function, expressed in decimals (default: - `0.8`). - -- `rowmax`: Minimum of non missing data in rows passed to the - `impute.KNN::impute` function, expressed in decimals (default: - `0.8`). 
- -- `IS_method`: Enter the method for the IS assignment for the - calculation of the concentration ratios with c("`SUBTAB`", - "`SAMPTAB`") (default: `SUBTAB`) - -- \`SUBTAB : The RTs of the substance_table are used to assign the - nearest internal standard. - -- `SAMPTAB`: The RTs of the samples table are used to assign the - nearest internal standard. - -- `n_localminmax`: Number of points which define the range of local - minima and maxima (default: `2`). - -- `plot_names`: Plot the names of the compounds during modelling - (helpful for debugging of failing compounds) with c(`FALSE`, `TRUE`) - (default: `FALSE`) - -### Section `results` - -This section includes the settings for the final results file. - -- `digits`: Enter number of digits for the output of decimals - (default: `6`). - -- `unit`: Enter the unit of the final data (default: `ng/L`) - -- `result_metadata`: - -- Selection of metadata columns to be includes in the final output. - Could be any from `MZmine 3` output, `substance_table`, - `target_table_for_review`, or `trimmed_for_review`. - -- Optional: The columns can be sorted in your preferred order. - -- **Warning**: The columns related to `MZmine 3` and the - `substance_table` must be included in the tags `data:meta_mzmine` - and `data:meta_substances`! - -- The values must be quoted, delimited by comma, and emphasized in - square brackets (default: ["compound_id", "compound", - "mzquant:compound_class", "row_mz", "row_rt", - "compound_db_identity:compound_annotation_score", - "compound_db_identity:precursor_mz", "row_mz_delta_ppm", - "manual_annotation:comment", "comment", "adduct", "mzquant:mode", - "mzquant:prim_sec", "mzquant:class", "gam_r2", "tracefinder", - "quantification_comment"]) - -- **Caution**: Some of the mzmine columns were replaced by names in - column 1. 
In these cases, the new name must be entered: - -- `mzquant_id` = "`id`" - -- `row_mz` = "`mz`" - -- `row_mz_min` = "`mz_range:min`" - -- `row_mz_max` = "`mz_range:max`" - -- `row_rt` = "`rt`" - -- `row_rt_min` = "`rt_range:min`" - -- `row_rt_max` = "`rt_range:max`" - -- `row_mz_delta_ppm` = "`compound_db_identity:mz_diff_ppm`" - -- `compound_id` = "`compound_db_identity:compound_db_identity`" - -- `result_classes`: Classes of sample groups quantified and passed to - the final output with c("`blanks`", "`qc`", "`quantification`", - "`samples`"). - -- The values must be quoted, delimited by comma, and emphasized in - square brackets (default: ["qc", "quantification", "samples"]). - -\newpage - -# SessionInfo - -```{r sessioninfo, echo = TRUE, eval = TRUE, message = FALSE} -sessionInfo() -``` - -\newpage - -# References - -```{r clean_up, echo = FALSE, results = "asis", eval = FALSE} -unlink(project_folder, recursive = TRUE) -``` +--- +title: "REcoTox - a workflow to process US EPA ECOTOX Knowledgebase ASCII files (PDF version)" +shorttitle: "REcoTox (version `r desc::desc_get_version()`)" +author: | + | Tobias Schulze + | Helmholtz Centre for Environmental Research - UFZ, Leipzig, Germany + | tsufz1@gmail.com +date: 2023-11-04 +output: + bookdown::pdf_document2: + toc: true + toc_depth: 2 + number_sections: true + latex_engine: "pdflatex" + global_numbering: true +vignette: > + %\VignetteIndexEntry{REcoTox - a workflow to process US EPA ECOTOX Knowledgebase ASCII files (PDF version)} + %\VignetteKeywords{E} + %\VignettePackage{REcoTox} + %\VignetteEncoding{UTF-8} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteDepends{BiocStyle, desc} +bibliography: references.bib +csl: biomed-central.csl +editor_options: + markdown: + wrap: 72 +--- + +```{r biocstyle, echo = FALSE, results = "asis"} +BiocStyle::markdown() +``` + +```{r init, message = FALSE, echo = FALSE, results = "hide" } +## Silently loading all packages +library(BiocStyle) +library(desc) +library(kableExtra) 
+library(tidyverse)
+```
+
+\newpage
+
+# Background
+
+The search and extraction of experimental ecotoxicological information
+is often tedious work. A good and comprehensive data source is the [US
+EPA ECOTOX
+Knowledgebase](https://cfpub.epa.gov/ecotox/ "US EPA ECOTOX Knowledgebase").
+It contains more than 1 million data points for almost 13,000 chemicals
+and 14,000 single species. However, for a high-throughput hazard
+assessment, it is not possible to extract all relevant data of the
+online database. The purpose of REcoTox is to extract the relevant
+information and to aggregate the data based on the user criteria out of
+the entire database [ASCII
+files](https://gaftp.epa.gov/ecotox/ecotox_ascii_03_10_2022.zip "ECOTOX Knowledgebase ASCII files").
+
+# Introduction
+
+[REcoTox](https://github.com/tsufz/REcoTox) is a semi-automated,
+interactive workflow to process [US EPA ECOTOX
+Knowledgebase](https://cfpub.epa.gov/ecotox/ "US EPA ECOTOX Knowledgebase")
+entire database [ASCII
+files](https://gaftp.epa.gov/ecotox/ecotox_ascii_03_10_2022.zip "ECOTOX Knowledgebase ASCII files")
+to extract and process ecotoxicological data relevant (but not
+restricted) to the ecotoxicity groups algae, crustaceans, and fish in
+the aquatic domain. The focus is aquatic ecotoxicity and the unit of the
+retrieved data is `mg/L`.
+
+# Input files and folders
+
+`REcoTox` requires an unzipped `US EPA Knowledgebase` database in
+`ASCII` format. The database is preferably expanded into its own
+database folder to be defined during the processing. The database
+consists of relatively referenced text files. The separator of the data
+is the pipe `|` symbol.
+
+In the first session of `REcoTox`, a file `chemical_properties.csv` is
+created in the database folder. This file contains chemical identifiers
+and chemical properties required for the processing of the chemical data
+in the knowledgebase and to tag the results.

+
+The chemical property file is dynamically updated and also requires some
+manual curation. It will grow as soon as new chemicals are added to the
+knowledgebase.
+
+The `project_folder` contains the `R` script for processing as well as
+the intermediate and final processing files. The naming of the folder is
+arbitrary, but do not use spaces; use underscores (`_`) or hyphens (`-`)
+for separating parts.
+
+To run the queries, a predefined processing script is available on
+`GitHub`
+([`Query_EcoTox_DB.R`](https://github.com/tsufz/REcoTox/blob/main/inst/extdata/Query_Ecotox_DB.R))
+or in the local `REcoTox` package folder.
+
+# Using REcoTox
+
+The following tutorial explains the different steps of `REcoTox` in a
+comprehensive demonstration. `REcoTox` includes different interactive
+steps, which require the evaluation of comma-separated text files
+(`*.csv`) in an external spreadsheet application (preferably LibreOffice
+[@LibreOffice]).
+
+## Load the REcoTox package
+
+```{r load REcoTox package, eval = FALSE, echo = TRUE, message = FALSE, warning = FALSE}
+# Load the REcoTox package
+library(REcoTox)
+```
+
+## Documentation for REcoTox
+
+A detailed description of all `REcoTox` functions is
+available in the `R Documentation`.
+
+```{r R Documentation, echo = TRUE, eval = FALSE}
+# Documentation of REcoTox
+help(package = "REcoTox")
+```
+
+## Preparation of the working environment (for beginners)
+
+The processing in `REcoTox` is interactively controlled by a processing
+script `Query_EcoTox_DB.R`.
+
+If you run `REcoTox` for the first time, a tutorial project is available
+to demonstrate all important steps of `REcoTox` processing. The
+following script prepares an example folder in your home directory
+and copies all necessary files into the folder.

+
+```{r initialize folders, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE}
+# Path of the project folder
+project_folder <- "REcoTox_demo"
+
+database_folder <- system.file("extdata/database_folder", package="REcoTox")
+# The project folder is created in the home directory
+project_path <- normalizePath(ifelse(.Platform$OS.type == "unix",
+                                     paste0("~/", project_folder),
+                                     paste0(
+                                       Sys.getenv("HOMEPATH"),
+                                       "\\",
+                                       project_folder
+                                     )
+))
+
+# An existing folder is deleted
+#if (dir.exists(project_folder)) {
+# unlink(project_folder, recursive = TRUE)
+#}
+```
+
+This command initializes the project folder and the database folder. It
+also copies the processing script to the project folder.
+
+```{r create project, echo = TRUE, message = FALSE, warning = FALSE, eval = TRUE}
+project <- REcoTox::create_project(database_path = database_folder,
+                                   project_path,
+                                   initalise_database_project = TRUE, # create the basic project from current ASCII files in DB folder
+                                   initalise_project = TRUE, # initializes the project folder
+                                   load_default = FALSE) # loads the default project in the project folder into memory
+
+file.copy(
+  from = system.file(
+    "extdata",
+    "Query_EcoTox_DB.R",
+    package = "REcoTox"
+  ),
+  to = normalizePath(
+    path = file.path(
+      project_folder,
+      "Query_EcoTox_DB.R"
+    ),
+    winslash = "\\",
+    mustWork = FALSE
+  ),
+  overwrite = TRUE
+  )
+
+```
+
+The `project_folder` contains the following files:
+
+```{r list project folder}
+# List files and directories in project_folder
+list.files(project_folder, recursive = TRUE, include.dirs = TRUE)
+```
+
+The `database_folder` contains the following files and folders:
+`chemical_properties.csv` is the file containing the curated chemical
+properties, `results.txt` contains the testing results collected in the
+knowledgebase, and `test.txt` contains the metadata of the tests.
+ +The folder `validation` contains the files `chemicals.txt` with chemical +information, the file `references.txt` contains the references and +`species.txt` the species. + +```{r list database folder} +# List files and directories in project_folder +list.files(database_folder, recursive = TRUE, include.dirs = TRUE) +``` + +It contains only the `Query_EcoTox_DB.R` file. + +## Review of the input data + +To review the input data, let us look in the data: + +```{r view chemical_properties, echo = TRUE, eval = TRUE, message = TRUE} +# Review of the chemical properties +chemical_properties <- readr::read_csv(file = normalizePath(path = file.path( + database_folder, + "chemical_properties.csv" +), ), show_col_types = FALSE) + +kable( + chemical_properties %>% + select(cas_number:dtxsid_ecotox) %>% + head(5), + format = "latex", digits = 2 +) +``` + +```{r view results, echo = TRUE, eval = TRUE, message = TRUE} +# Review of the result table +results <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "results.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + results %>% + select(result_id:sample_size_mean) %>% + head(5), + format = "latex", digits = 2 +) +``` + +```{r view chemicals, echo = TRUE, eval = TRUE, message = TRUE} +# Review of the substance_table +substances <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "validation", + "chemicals.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + substances %>% + select(cas_number:ecotox_group) %>% + head(5), + format = "latex", digits = 2 +) +``` + +```{r view references, echo = TRUE, eval = TRUE, message = TRUE} +# Review of the substance_table +references <- + readr::read_delim( + file = normalizePath( + path = file.path( + database_folder, + "validation", + "references.txt" + ), + ), + show_col_types = FALSE, + delim = "|" + + ) + +kable( + references %>% + select(reference_number:author) %>% + 
head(5),
+  format = "latex", digits = 2
+)
+```
+
+```{r view species, echo = TRUE, eval = TRUE, message = TRUE}
+# Review of the species table
+species <-
+  readr::read_delim(
+    file = normalizePath(
+      path = file.path(
+        database_folder,
+        "validation",
+        "species.txt"
+      ),
+    ),
+    show_col_types = FALSE,
+    delim = "|"
+
+  )
+
+kable(
+  species %>%
+    select(species_number:kingdom) %>%
+    head(5),
+  format = "latex", digits = 2
+)
+```
+
+## Preparation of the database environment and initialisation of the project folder
+
+In the first step, the function `create_project` creates the database project and
+initializes the database and project folders:
+
+1. Load the ASCII files.
+
+2. Create a file `chemical_properties.csv` based on the `chemicals.txt` table.
+If this table exists, it is loaded.
+
+3. Store the initial database project in `project.Rdata` in the database folder.
+
+4. Store the initial project in `initial_project.Rdata` in the project folder.
+
+### Parameters
+
+- `initialise_database_project` (TRUE/FALSE): Creates the basic database project
+from the current ASCII files in the database folder and (if not existing) stores
+the `chemical_properties.csv` in the database folder.
+
+- `initialise_project` (TRUE/FALSE): Stores the `REcoTox` environment in an initial
+`Rdata` object named `initial_project.Rdata` in the project folder.
+
+- `load_default` (TRUE/FALSE): Loads an existing basic database project from the
+database folder and stores it in the project.
+
+`chemical_properties.csv`: This table contains the internal chemical ID `cas_number` (i.e.
+the CAS number in integer format) and related user-curated metadata (e.g.,
+chemical identifiers such as InChIKey, or PubChem CIDs) and chemical property
+data (i.e. log S values). It will be re-used and extended in future
+analyses to minimize curation efforts. If this file exists, it will be loaded
+into the project environment. 
Because the `chemicals.txt` table only contains `CAS` numbers
+in integer format, a regular CAS number is added (e.g., 1912-24-9 for 1912249).
+
+```{r initialize databases, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE}
+project <- REcoTox::create_project(database_path = database_folder,
+                                   project_path,
+                                   initalise_database_project = TRUE,
+                                   initalise_project = TRUE,
+                                   load_default = FALSE
+)
+```
+
+## Preparation of the initial project
+In the second step, utilizing the function `prepare_data`, the tables `test`,
+`results`, `species`, `chemicals`, and `references` are joined by the IDs `test_id`,
+`cas_number`, `species_number`, and `reference_number`. The initial environment
+is stored in the file `initial_project.Rdata` in the project folder.
+This file will be the same for all analyses related to a database revision.
+Thus, it could be copied from another project to avoid rerunning initial steps.
+
+### Parameters
+
+- `project`: Name of the project environment.
+
+- `load_initial_project` (TRUE/FALSE): Loads the `initial_project.Rdata` of the
+project folder.
+
+- `new_project_path`: The `initial_project.Rdata` contains the project folder
+path where it was initially created. For example, in case of moving the project
+folder or if the `initial_project.Rdata` was copied from another folder, it is
+required to set a new project path.
+
+- `save_project` (TRUE/FALSE): Save the `initial_project.Rdata`. For example,
+in case the project folder was renewed.
+
+```{r initialize project, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE}
+project <- REcoTox::prepare_data(project = project,
+                                 load_initial_project = FALSE,
+                                 new_project_path = NA,
+                                 save_project = TRUE
+)
+```
+
+## Processing the data
+In the third step, the function `process_data` reads the following settings to
+query the database accordingly.
+
+A list of relevant endpoints (e.g., EC50) and all relevant species are generated
+and exported to the project folder for review. 
+The two files are `ecotoxgroup_endpoint_selection.csv` and +`ecotoxgroup_species_selection.csv`. The review could be performed in any +spreadsheet program. The data must be stored in the `comma` delimited format! + +The former table contains a field `include_endpoint`, this controls the inclusion +of each endpoint by setting the value to 0 or 1 (0 = not included, 1 = included). +Other values are not accepted, and the import of the file in the next processing +step will be declined. The value 0 is the default, and thus, the endpoints to +be included should be marked with 1. + +The latter table contains a field include_species, this controls the inclusion of each species by setting the value to 0 or 1 (0 = not included, 1 = included). Depending on the settings of species_selection, the preset is different: + +- `include_species` is set to 1 + +- `include_species` is set to 0 + +- `include_species` is set to 1 for standard test species and set to 0 for other species + +Review and edit the tables in a preferred spreadsheet program. If changed, +save the changes in the same file. The separator must be `comma`. + +In this step, the database is queried to select the datasets related to the +goals of the analysis. + +The queries can be controlled by the following parameters: + +- `dosing_group`: Specifies the compartment to which the dosing is referenced (so far only "water_concentration", i.e. result is mg/L) + +- `duration_d`: Duration of the exposure in days (e.g. `d`, `dph`, `dpf`) + +- `duration_h`: Duration of the exposure in hours (e.g. `h`, `ht`, `hph`, `hpf`, `hbf`, `hv`) + +- `duration_m`: Duration of the exposure in minutes (e.g. `mi`) + +- `ecotoxgroup`: Species group (e.g. `Algae`, `Crustacean`, `Fish`) + +- `effects`: Effect endpoints (e.g. `MOR`, `GRO`, `DEV`) + +- `habitat`: Habitat of the ecotoxgroup (i.e. `Non-Soil`, `Water`, `Soil`) + +- `kingdoms`: Specification of the `algae` kingdoms (e.g. 
`Chromista`, `Plantae`, `Monera`)
+
+- `measurements`: Specification of specific measurements
+
+- `min_h`: Minimum duration of the experiment in hours
+
+- `max_h`: Maximum duration of the experiment in hours
+
+- `min_d`: Minimum duration of the experiment in days
+
+- `max_d`: Maximum duration of the experiment in days
+
+- `min_m`: Minimum duration of the experiment in minutes
+
+- `max_m`: Maximum duration of the experiment in minutes
+
+- `species_selection`: Selection of species (i.e. `all`, `manual`, `standard_test_species`)
+
+Where `all` selects all species of an `ecotoxgroup`, `manual` expects manual selection in
+the files mentioned above and `standard_test_species` selects only species marked as
+standardized species.
+
+### Filtering the data
+In the processing step 1, the data in the database is filtered based on the settings
+to extract relevant data of the database.
+
+```{r run step 1, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE}
+
+# set the parameters
+dosing_group = "water_concentration" # i.e. 
mg/L (only available group in this version)
+duration_d = c("d", "dph", "dpf")
+duration_h = c("h", "ht", "hph", "hpf", "hbf", "hv")
+duration_m = "mi"
+ecotoxgroup = "Algae" # c("Algae", "Crustacean", "Fish")
+effects = c("MOR", "GRO", "POP", "REP", "MPH", "DEV") # Algae/Fish
+#effects = c("MOR", "GRO", "POP", "REP", "MPH", "DEV", "ITX") # Crustacean
+habitat = "Water" #c("Non-Soil","Water","Soil")
+kingdoms = NA # vector of specific algae kingdoms: c("Chromista","Plantae","Monera")
+measurements = NA # vector of specific measurements
+min_h = 0
+min_d = 0
+max_h = 120
+max_d = 5
+min_m = 0
+max_m = 7200
+species_selection = "all" # c("all", "manual", "standard_test_species")
+
+# run the processing step
+project <- REcoTox::process_data(project,
+                                 dosing_group = dosing_group,
+                                 duration_d = duration_d,
+                                 duration_h = duration_h,
+                                 duration_m = duration_m,
+                                 ecotoxgroup = ecotoxgroup,
+                                 effects = effects,
+                                 habitat = habitat,
+                                 kingdoms = kingdoms,
+                                 measurements = measurements,
+                                 max_d = max_d,
+                                 min_d = min_d,
+                                 max_h = max_h,
+                                 min_h = min_h,
+                                 max_m = max_m,
+                                 min_m = min_m,
+                                 remove_formulation = FALSE,
+                                 save_project_steps = FALSE,
+                                 species_selection = species_selection
+)
+
+```
+
+This step stores two files in the `project_folder`, `ecotoxgroup_species_selection.csv`
+and `ecotoxgroup_endpoint_selection.csv`. The first block of the file is related to the
+ecotoxgroup specified. The species selection file contains all species extracted
+for review and the endpoint selection file the respective endpoints (e.g. EC50). To include
+species or endpoints, mark the data with `1`, otherwise to exclude, mark with `0`.
+
+### Filtering species and endpoints
+
+After review and saving the files, run the following command. This command
+reads the files and the data is filtered accordingly.
+
+The units in the database are quite divergent and thus a unit conversion is performed
+to transform all units and values to `mg/L`. 
In case of mol related units, +the transformation is automated so far the chemical and the molecular weight is +already in the database. If not, the file `ecotoxgroup_mol_weight.csv` is exported +to the `project_folder`. + +```{run step 2, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE} +project <- REcoTox::process_data(project, save_project_steps = FALSE) + +``` + +### Unit conversion + +Review and edit the file `ecotoxgroup_mol_weight.csv` to add the molecular weight to +the list. The ecotoxicity data is interactivitely enriched with chemical information +(e.g. the average mass). + +In best case with data linked to [US EPA CompTox Chemicals Dashboard](https://comptox.epa.gov/dashboard/ "US EPA CompTox Chemicals Dashboard") +for example by using the output of the [batch search](https://comptox.epa.gov/dashboard/batch-search "US EPA CompTox Chemicals Dashboard Batch Search") +according to Figure 1 and Figure 2. + +![Figure1: US EPA CompTox Chemicals Dashboard Batch Search - Enter Identifiers to Search](./figures/Figure_1.png "Figure 1: US EPA CompTox Chemicals Dashboard Batch Search - Enter Identifiers to Search") + +![Figure 2: US EPA CompTox Chemicals Dashboard Batch Search - Recommended selection of identifiers and properties](./figures/Figure_2.png "Figure 2: US EPA CompTox Chemicals Dashboard Batch Search - Recommended selection of identifiers and properties") + +After update of the mol weight table, run the following command to finalise the +unit conversion step. + +```{run step 3, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE} +project <- REcoTox::process_data(project, save_project_steps = FALSE) + +``` + +### Chemical properties data and final processing + +The former processing step creates a file named `ecotoxgroup_chemical_list.csv`. +Edit this list to include newly added compounds (imputation of phys.-chem. +properties and metadata). + +To score the quality of the data, the solubility domain of the result is +calculated. 
The calculation requires the experimental or predicted solubility +of the chemical. + +```{run step 4, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE} +project <- REcoTox::process_data(project, save_project_steps = FALSE, update_chemicals = FALSE) + +``` +The file `ecotoxgroup_final_results.csv` is stored in the `project_folder`. +It contains the results of the processing in the long pivot format. + + +## Preparation of the wide pivot table with the aggregated ecotoxicity information + +For final processing and to aggregate the data in the wide pivot format, +run the following final step. + +```{run step 5, echo = TRUE, message = FALSE, warning = FALSE, eval = FALSE} +project <- REcoTox::aggregate_results(project = project, quantile = 0.05) + +``` + +\newpage + +# SessionInfo + +```{r sessioninfo, echo = TRUE, eval = TRUE, message = FALSE} +sessionInfo() +``` + +# References + +```{r clean_up, echo = FALSE, results = "asis", eval = FALSE} +#unlink(project_folder, recursive = TRUE) +``` diff --git a/vignettes/REcoTox_PDF.pdf b/vignettes/REcoTox_PDF.pdf new file mode 100644 index 0000000..d477b37 Binary files /dev/null and b/vignettes/REcoTox_PDF.pdf differ