diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 275108a1..9cfbc7f2 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -2,7 +2,7 @@ # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] + branches: [main, master, develop] pull_request: branches: [main, master, develop] @@ -18,17 +18,16 @@ jobs: fail-fast: false matrix: config: - - {os: macos-latest, r: 'release'} - {os: windows-latest, r: 'release'} - - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} - {os: ubuntu-latest, r: 'release'} + - {os: ubuntu-latest, r: 'oldrel-1'} env: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} R_KEEP_PKG_SOURCE: yes steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2 @@ -45,4 +44,5 @@ jobs: - uses: r-lib/actions/check-r-package@v2 with: - upload-snapshots: true \ No newline at end of file + upload-snapshots: true + build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' \ No newline at end of file diff --git a/.github/workflows/pkgdown.yaml b/.github/workflows/pkgdown.yaml index ed7650c7..57aba397 100644 --- a/.github/workflows/pkgdown.yaml +++ b/.github/workflows/pkgdown.yaml @@ -22,7 +22,7 @@ jobs: permissions: contents: write steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-pandoc@v2 diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml index 4b654182..2fb743b7 100644 --- a/.github/workflows/test-coverage.yaml +++ b/.github/workflows/test-coverage.yaml @@ -2,9 +2,9 @@ # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help on: push: - branches: [main, master] + branches: [main, master, develop] pull_request: - branches: [main, master] + branches: [main, master, develop] name: test-coverage @@ -15,7 +15,7 @@ jobs: GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - uses: r-lib/actions/setup-r@v2 with: @@ -27,5 +27,24 @@ jobs: needs: coverage - name: Test coverage - run: covr::codecov(quiet = FALSE) + run: | + covr::codecov( + quiet = FALSE, + clean = FALSE, + install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") + ) shell: Rscript {0} + + - name: Show testthat output + if: always() + run: | + ## -------------------------------------------------------------------- + find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true + shell: bash + + - name: Upload test results + if: failure() + uses: actions/upload-artifact@v4 + with: + name: coverage-test-failures + path: ${{ runner.temp }}/package \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index 2368111f..3ba66588 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: APCalign Title: Resolving Plant Taxon Names Using the Australian Plant Census -Version: 0.1.4 +Version: 1.0.0 Authors@R: c( person(given = "Daniel", family = "Falster", role = c("aut", "cre", "cph"), email = "daniel.falster@unsw.edu.au", comment = c(ORCID = "0000-0002-9814-092X")), person(given = "Elizabeth", family = "Wenk", role = c("aut", "ctb"), email = "e.wenk@unsw.edu.au", comment = c(ORCID = "0000-0001-5640-5910")), @@ -18,11 +18,10 @@ Depends: Imports: readr, purrr, - forcats, - tibble, dplyr, stringr, stringi, + stringdist, crayon, httr, jsonlite, @@ -38,9 +37,8 @@ Suggests: kableExtra, here, testthat (>= 3.0.0) -Remotes: apache/arrow/r Roxygen: list(markdown = TRUE) -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Config/testthat/edition: 3 VignetteBuilder: 
knitr URL: https://traitecoevo.github.io/APCalign/, https://github.com/traitecoevo/APCalign diff --git a/NAMESPACE b/NAMESPACE index 0b9d2cd4..78a24c4f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,23 +3,13 @@ export(align_taxa) export(create_species_state_origin_matrix) export(create_taxonomic_update_lookup) +export(default_version) export(load_taxonomic_resources) export(native_anywhere_in_australia) export(standardise_names) +export(standardise_taxon_rank) export(state_diversity_counts) export(strip_names) -export(strip_names_2) +export(strip_names_extra) export(update_taxonomy) -import(dplyr) -import(stringr) -importFrom(crayon,red) -importFrom(dplyr,arrange) -importFrom(dplyr,distinct) -importFrom(dplyr,filter) -importFrom(dplyr,mutate) -importFrom(dplyr,select) -importFrom(readr,col_character) -importFrom(readr,col_logical) -importFrom(readr,cols) -importFrom(readr,read_csv) -importFrom(tibble,tibble) +importFrom(dplyr,"%>%") diff --git a/NEWS.md b/NEWS.md index 60ae479d..624ae906 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,37 @@ -# APCalign 0.1.4 +# APCalign 1.0.0 +First major release of APCalign. A preprint is available at +https://www.biorxiv.org/content/10.1101/2024.02.02.578715v1. +Article has been accepted for publication in the Australian Journal of Botany. -* Better handling of errors when API/network connection is down for `load_taxonomic_resources` +Following review, a number of changes have been implemented. These have sped & +streamlined the package. -* Refined testing for `load_taxonomic_resources` +* Update function documentation +* Speed up `extract_genus` +* Write a replacement function for `stringr::word` that is much faster. +* Additional speed up and accuracy of fuzzy_match function by + - Restricting reference list to names with the same first letter as input string.
+ - Switch from using `utils::adist` to `stringdist::stringdist(method = "dl")` +* Rework `standardise_names` to remove punctuation from the start of the string +* Rework `strip_names_extra` (previously `strip_names_2`) to just perform +additional functions to `strip_names`, rather than repeating those performed by `strip_names`. +* Avoid importing entire packages by using package::function format throughout +and removing functions from @import +* Add fuzzy match arguments to `create_taxonomic_update_lookup` +* Add 3 additional family-level APC matches to `match_taxa`. +* Refine tests +* Make messages to console optional +* Fix failures when GitHub is down (https://github.com/traitecoevo/APCalign/issues/205) +# APCalign 0.1.5 +* Update installation instructions +* Added how to cite and version APCalign as an article +* Exported `default_version` +* Add citing method for R package +* Update GitHub Actions +* Improved family alignments +* Added `standardise_taxon_rank` +* Improved messaging during alignment diff --git a/R/APCalign-package.R b/R/APCalign-package.R index 38b2b88a..0cb29cb4 100644 --- a/R/APCalign-package.R +++ b/R/APCalign-package.R @@ -10,7 +10,8 @@ #' @name APCalign #' @docType package #' @references If you have any questions, comments or suggestions, please -#' submit an issue at our [GitHub repository](https://github.com/traitecoevo/APCalign/issues) +#' submit an issue at our +#' [GitHub repository](https://github.com/traitecoevo/APCalign/issues) #' @keywords internal #' @section Functions: #' **Standarise taxon names** @@ -51,9 +52,12 @@ utils::globalVariables( "checked", "cleaned_name", "family", + "family_accepted", "fuzzy_match_genus", "fuzzy_match_genus_APNI", "fuzzy_match_genus_synonym", + "fuzzy_match_family", + "fuzzy_match_family_synonym", "genus", "genus_accepted", "known", @@ -75,6 +79,8 @@ utils::globalVariables( "taxon_ID", "taxon_ID_aligned", "taxon_rank", + "txtProgressBar", + "setTxtProgressBar", "taxonomic_status",
"taxonomic_status_aligned", "taxonomic_status_genus", diff --git a/R/align_taxa.R b/R/align_taxa.R index 90c2fcb0..5d36616e 100644 --- a/R/align_taxa.R +++ b/R/align_taxa.R @@ -1,60 +1,178 @@ -#' For a list of Australian plant names, find taxonomic or scientific name alignments to the APC or APNI through standardizing formatting and fixing spelling errors +#' @title Align Australian plant scientific names to the APC or APNI +#' +#' @description +#' For a list of Australian plant names, find taxonomic or scientific name +#' alignments to the APC or APNI through standardizing formatting and fixing +#' spelling errors. +#' +#' Usage case: Users will run this function if they wish to see the details +#' of the matching algorithms, the many output columns that the matching +#' function compares to as it seeks the best alignment. They may also select +#' this function if they want to adjust the “fuzziness” level for fuzzy +#' matches, options not allowed in create_taxonomic_update_lookup. This +#' function is the first half of create_taxonomic_update_lookup. #' -#' This function finds taxonomic alignments in APC or scientific name alignments in APNI. -#' It uses the internal function `match_taxa` to attempt to match input strings to taxon names in the APC/APNI. -#' It sequentially searches for matches against more than 20 different string patterns, -#' prioritising exact matches (to accepted names as well as synonyms, orthographic variants) over fuzzy matches. -#' It prioritises matches to taxa in the APC over names in the APNI. -#' It identifies string patterns in input names that suggest a name can only be aligned to a genus -#' (hybrids that are not in the APC/ANI; graded species; taxa not identified to species), -#' and indicates these names only have a genus-rank match. +#' @details +#' - This function finds taxonomic alignments in APC or scientific name +#' alignments in APNI. 
+#' - It uses the internal function `match_taxa` to attempt to match input +#' strings to taxon names in the APC/APNI. +#' - It sequentially searches for matches against more than 20 different string +#' patterns, prioritising exact matches (to accepted names as well as +#' synonyms, orthographic variants) over fuzzy matches. +#' - It prioritises matches to taxa in the APC over names in the APNI. +#' - It identifies string patterns in input names that suggest a name can only +#' be aligned to a genus (hybrids that are not in the APC/APNI; graded species; +#' taxa not identified to species), and indicates these names only have a +#' genus-rank match. +#' +#' Notes: +#' +#' - If you will be running the function APCalign::create_taxonomic_update_lookup +#' many times, it is best to load the taxonomic resources separately using +#' resources <- load_taxonomic_resources(), then add the argument +#' resources = resources +#' - The name Banksia cerrata does not align as the fuzzy matching algorithm +#' does not allow the first letter of the genus and species epithet to change. +#' - With this function you have the option of changing the fuzzy matching +#' parameters. The defaults, with fuzzy matches only allowing changes of 3 +#' (or fewer) characters AND 20% (or less) of characters have been carefully +#' calibrated to catch just about all typos, but very, very rarely mis-align +#' a name. If you wish to introduce less conservative fuzzy matching it is +#' recommended you manually check the aligned names. +#' - It is recommended that you begin with imprecise_fuzzy_matches = FALSE (the +#' default), as quite a few of the less precise fuzzy matches are likely to be +#' erroneous. This argument should be turned on only if you plan to check all +#' alignments manually. +#' - The argument identifier allows you to add a fixed text string to all genus- +#' and family- level names, such as identifier = "Royal NP" would return "Acacia +#' sp. \[Royal NP]".
#' #' @param original_name A list of names to query for taxonomic alignments. #' @param output (optional) The name of the file to save the results to. #' @param full Parameter to determine how many columns are output -#' @param resources the taxonomic resources used to align the taxa names. Loading this can be slow, -#' so call \code{\link{load_taxonomic_resources}} separately to greatly speed this function up and pass the resources in. -#' @param fuzzy_abs_dist The number of characters allowed to be different for a fuzzy match. -#' @param fuzzy_rel_dist The proportion of characters allowed to be different for a fuzzy match. -#' @param fuzzy_matches Fuzzy matches are turned on as a default. The relative and absolute distances allowed for fuzzy matches to species and infraspecific taxon names are defined by the parameters `fuzzy_abs_dist` and `fuzzy_rel_dist` -#' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned off as a default. -#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned off as a default. -#' @param identifier A dataset, location or other identifier, which defaults to NA. +#' @param resources the taxonomic resources used to align the taxa names. +#' Loading this can be slow, so call \code{\link{load_taxonomic_resources}} +#' separately to greatly speed this function up and pass the resources in. +#' @param quiet Logical to indicate whether to display messages while +#' aligning taxa. +#' @param fuzzy_abs_dist The number of characters allowed to be different for a +#' fuzzy match. +#' @param fuzzy_rel_dist The proportion of characters allowed to be different +#' for a fuzzy match. +#' @param fuzzy_matches Fuzzy matches are turned on as a default. 
+#' The relative and absolute distances allowed for fuzzy matches to species and +#' infraspecific taxon names are defined by the parameters `fuzzy_abs_dist` +#' and `fuzzy_rel_dist` +#' @param imprecise_fuzzy_matches Imprecise fuzzy matching uses the +#' fuzzy matching function with lenient levels set (absolute distance of +#' 5 characters; relative distance = 0.25). +#' It offers a way to get a wider range of possible names, possibly +#' corresponding to very distant spelling mistakes. +#' This is FALSE as default and all outputs should be checked as it often +#' makes erroneous matches. +#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) +#' are turned on as a default. +#' @param identifier A dataset, location or other identifier, +#' which defaults to NA. #' -#' @return A tibble with columns that include original_name, aligned_name, taxonomic_dataset, taxon_rank, aligned_reason, alignment_code. +#' @return A tibble with columns that include original_name, aligned_name, +#' taxonomic_dataset, taxon_rank, aligned_reason, alignment_code. #' - original_name: the original plant name input. -#' - aligned_name: the original plant name after the function standardise_names has standardised the syntax of infraspecific taxon designations. +#' - aligned_name: the original plant name after the function standardise_names +#' has standardised the syntax of infraspecific taxon designations. #' - taxonomic_dataset: the source of the aligned names (APC or APNI). #' - taxon_rank: the taxonomic rank of the aligned name. -#' - aligned_reason: the explanation of a specific taxon name alignment (from an original name to an aligned name). -#' - alignment_code: a code that accompanies the aligned_reason, indicating the relative sequence of the match during the alignment process. -#' - cleaned_name: original name with punctuation and infraspecific taxon designation terms standardised by the function standardise_names; streamlines exact matches.
-#' - stripped_name: cleaned name with punctuation and infraspecific taxon designation terms removed by the function strip_names; improves fuzzy matches. -#' - stripped_name2: cleaned name with punctuation, infraspecific taxon designation terms, and other filler words removed by the function strip_names_2; required for matches to `first two word` and `first three words`. -#' - trinomial: the first three words in `stripped_name2`, required for matches that ignore all other text in the original_name; improves phrase name matches. -#' - binomial: the first two words in `stripped_name2`, required for matches that ignore all other text in the original_name; improves phrase name matches. -#' - genus: the first two words in `cleaned_name`; required for genus-rank matches and reprocessing of genus-rank names. -#' - fuzzy_match_genus: fuzzy match of genus column to best match among APC-accepted names; required for fuzzy matches of genus-rank names. -#' - fuzzy_match_genus_synonym: fuzzy match of genus column to best match among APC-known names, only considering different matches to those documented under APC-accepted genera; required for fuzzy matches of genus-rank names. -#' - fuzzy_match_genus_APNI: fuzzy match of genus column to best match among APNI names, only considering different matches to those documented under APC-accepted and APC-known genera; required for fuzzy matches of genus-rank names. -#' - fuzzy_match_cleaned_APC: fuzzy match of stripped_name to APC-accepted names; created for yet-to-be-aligned names at the match step 07a in the function `match_taxa`. -#' - fuzzy_match_cleaned_APC_synonym: fuzzy match of stripped_name to APC-known names; created for yet-to-be-aligned names at the match step 07b in the function `match_taxa`. -#' - fuzzy_match_cleaned_APC_imprecise: imprecise fuzzy match of stripped_name to APC-accepted names; created for yet-to-be-aligned names at the match step 10a in the function `match_taxa`. 
-#' - fuzzy_match_cleaned_APC_synonym_imprecise: imprecise fuzzy match of stripped_name to APC-accepted names; created for yet-to-be-aligned names at the match step 10b in the function `match_taxa`. -#' - fuzzy_match_binomial: fuzzy match of binomial column to best match among APC-accepted names; created for yet-to-be-aligned names at match step 15a in the function `match_taxa`. -#' - fuzzy_match_binomial_APC_synonym: fuzzy match of binomial column to best match among APC-known names; created for yet-to-be-aligned names at match step 15a in the function `match_taxa`. -#' - fuzzy_match_trinomial: fuzzy match of trinomial column to best match among APC-accepted names; created for yet-to-be-aligned names at match step 16a in the function `match_taxa`. -#' - fuzzy_match_trinomial_synonym: fuzzy match of trinomial column to best match among APC-known names; created for yet-to-be-aligned names at match step 16b in the function `match_taxa`. -#' - fuzzy_match_cleaned_APNI: fuzzy match of stripped_name to APNI names; created for yet-to-be-aligned names at the match step 16a in the function `match_taxa`. -#' - fuzzy_match_cleaned_APNI_imprecise: imprecise fuzzy match of stripped_name to APNI names; created for yet-to-be-aligned names at the match step 17a in the function `match_taxa`. +#' - aligned_reason: the explanation of a specific taxon name alignment +#' (from an original name to an aligned name). +#' - alignment_code: a code that accompanies the aligned_reason, indicating the +#' relative sequence of the match during the alignment process. +#' - cleaned_name: original name with punctuation and infraspecific taxon +#' designation terms standardised by the function standardise_names; +#' streamlines exact matches. +#' - stripped_name: cleaned name with punctuation and infraspecific taxon +#' designation terms removed by the function strip_names; +#' improves fuzzy matches. 
+#' - stripped_name2: cleaned name with punctuation, infraspecific taxon +#' designation terms, and other filler words removed by +#' the function `strip_names_extra`; +#' required for matches to `first two word` and `first three words`. +#' - trinomial: the first three words in `stripped_name2`, required for matches +#' that ignore all other text in the original_name; +#' improves phrase name matches. +#' - binomial: the first two words in `stripped_name2`, required for matches +#' that ignore all other text in the original_name; +#' improves phrase name matches. +#' - genus: the first two words in `cleaned_name`; +#' required for genus-rank matches and reprocessing of genus-rank names. +#' - fuzzy_match_genus: fuzzy match of genus column to best match among +#' APC-accepted names; +#' required for fuzzy matches of genus-rank names. +#' - fuzzy_match_genus_synonym: fuzzy match of genus column to best match among +#' APC-synonymous names, only considering different matches to those documented +#' under APC-accepted genera; required for fuzzy matches of genus-rank names. +#' - fuzzy_match_genus_APNI: fuzzy match of genus column to best match among +#' APNI names, only considering different matches to those documented under +#' APC-accepted and APC-known genera; required for fuzzy matches of +#' genus-rank names. +#' - fuzzy_match_family: fuzzy match of genus column to best match among +#' APC-accepted family names; required for fuzzy matches of family-rank names. +#' - fuzzy_match_family_synonym: fuzzy match of genus column to best match +#' among APC-synonymous family names; required for fuzzy matches of +#' family-rank names. +#' - fuzzy_match_cleaned_APC: fuzzy match of stripped_name to APC-accepted +#' names; created for yet-to-be-aligned names at the match step 05a +#' in the function `match_taxa`. 
+#' - fuzzy_match_cleaned_APC_synonym: fuzzy match of stripped_name to +#' APC-synonymous names; created for yet-to-be-aligned names at the +#' match step 05b in the function `match_taxa`. +#' - fuzzy_match_cleaned_APC_imprecise: imprecise fuzzy match of stripped_name +#' to APC-accepted names; created for yet-to-be-aligned names at the +#' match step 07a in the function `match_taxa`. +#' - fuzzy_match_cleaned_APC_synonym_imprecise: imprecise fuzzy match of +#' stripped_name to APC-synonymous names; created for yet-to-be-aligned names +#' at the match step 07b in the function `match_taxa`. +#' - fuzzy_match_binomial: fuzzy match of binomial column to best match among +#' APC-accepted names; created for yet-to-be-aligned names at +#' match step 10c in the function `match_taxa`. +#' - fuzzy_match_binomial_APC_synonym: fuzzy match of binomial column to best +#' match among APC-synonymous names; created for yet-to-be-aligned names at +#' match step 10d in the function `match_taxa`. +#' - fuzzy_match_trinomial: fuzzy match of trinomial column to best match +#' among APC-accepted names; created for yet-to-be-aligned names at +#' match step 09c in the function `match_taxa`. +#' - fuzzy_match_trinomial_synonym: fuzzy match of trinomial column to best +#' match among APC-synonymous names; created for yet-to-be-aligned names at +#' match step 09d in the function `match_taxa`. +#' - fuzzy_match_cleaned_APNI: fuzzy match of stripped_name to APNI names; +#' created for yet-to-be-aligned names at the match step 11a in the +#' function `match_taxa`. +#' - fuzzy_match_cleaned_APNI_imprecise: imprecise fuzzy match of +#' stripped_name to APNI names; created for yet-to-be-aligned names +#' at the match step 11b in the function `match_taxa`.
#' #' @export #' #' @examples -#' \donttest{align_taxa(c("Poa annua", "Abies alba"))} -#' -#' @importFrom readr read_csv cols col_logical col_character -#' @importFrom tibble tibble +#' \donttest{ +#' resources <- load_taxonomic_resources() +#' +#' # example 1 +#' align_taxa(c("Poa annua", "Abies alba"), resources = resources) +#' +#' # example 2 +#' input <- c("Banksia serrata", "Banksia serrate", "Banksia cerrata", +#' "Banksia serrrrata", "Dryandra sp.", "Banksia big red flowers") +#' +#' aligned_taxa <- +#' APCalign::align_taxa( +#' original_name = input, +#' identifier = "APCalign test", +#' full = TRUE, +#' resources = resources +#' ) +#' +#' } #' #' #' @seealso @@ -69,6 +187,7 @@ align_taxa <- function(original_name, output = NULL, full = FALSE, resources = load_taxonomic_resources(), + quiet = FALSE, fuzzy_abs_dist = 3, fuzzy_rel_dist = 0.2, fuzzy_matches = TRUE, @@ -76,10 +195,14 @@ align_taxa <- function(original_name, APNI_matches = TRUE, identifier = NA_character_) { - message("Checking alignments of ", dplyr::n_distinct(original_name, na.rm = TRUE), " taxa\n") + if(!quiet) + message("Checking alignments of ", + dplyr::n_distinct(original_name, na.rm = TRUE), + " taxa\n") if (!is.null(output) && file.exists(output)) { - message(" - reading existing data from ", output) + if(!quiet) + message(" - reading existing data from ", output) taxa_raw <- readr::read_csv( @@ -90,16 +213,27 @@ align_taxa <- function(original_name, .default = readr::col_character() ) ) - - # TODO: check taxa_ raw has correct columns + correct_names <- c("original_name", "aligned_name", "accepted_name", + "suggested_name", "genus", "family", "taxon_rank", + "taxonomic_dataset", "taxonomic_status", + "taxonomic_status_aligned", "aligned_reason", + "update_reason", "subclass", "taxon_distribution", + "scientific_name", "taxon_ID", "taxon_ID_genus", + "scientific_name_ID", "canonical_name", "row_number", + "number_of_collapsed_taxa", "checked", "known") + if(!identical(names(taxa_raw), 
correct_names)) { + stop("Your output file already exists and it's not in the right format. + Please check that the file you are passing in to the output option.") + } } else { taxa_raw <- - tibble::tibble( + dplyr::tibble( original_name = character(0L), cleaned_name = character(0L), aligned_name = character(0L), taxonomic_dataset = character(0L), + identifier = character(0L), known = logical(0L), checked = logical(0L) ) @@ -111,13 +245,14 @@ align_taxa <- function(original_name, taxa[["tocheck"]] <- dplyr::bind_rows( taxa_raw, - tibble::tibble( + dplyr::tibble( original_name = # only include new names subset(original_name, !is.na(original_name) & !original_name %in% taxa_raw$original_name ), + identifier = identifier, cleaned_name = NA_character_, stripped_name = NA_character_, stripped_name2 = NA_character_, @@ -129,6 +264,8 @@ align_taxa <- function(original_name, fuzzy_match_genus = NA_character_, fuzzy_match_genus_synonym = NA_character_, fuzzy_match_genus_APNI = NA_character_, + fuzzy_match_family = NA_character_, + fuzzy_match_family_synonym = NA_character_, fuzzy_match_binomial = NA_character_, fuzzy_match_binomial_APC_synonym = NA_character_, fuzzy_match_trinomial = NA_character_, @@ -146,35 +283,64 @@ align_taxa <- function(original_name, known = FALSE ) ) %>% - # take unique values so each name only processed once - dplyr::filter(!duplicated(original_name)) + # take unique values of original name by identifier combinations + # so each name only processed once (or multiple times if unique identifiers) + dplyr::filter(!duplicated(paste0(original_name, identifier))) %>% + dplyr::filter(original_name %>% standardise_names() != "") - if (all(taxa$tocheck$checked)) { - message(" - all taxa are already checked, yay!") + if (all(taxa$tocheck$checked)|all(is.na(taxa$tocheck$checked))) { + if(!quiet) + message(" - all taxa are already checked, yay!") return(invisible(taxa$tocheck)) } # move all checked taxa to "checked" taxa <- redistribute(taxa) - # check unknown 
taxa - message( + # messages if there is an saved list being added to + if ( + !is.null(output) && + file.exists(output) && + !all(taxa$tocheck$checked) && + !quiet + ) { + # check unknown taxa + message( " -> ", - crayon::blue(sum(taxa$tocheck$known, na.rm = T)), + crayon::blue(sum(!is.na(taxa$checked$accepted_name), na.rm = T)), " names already matched; ", crayon::blue(sum( - taxa$tocheck$checked & - !taxa$tocheck$known, + is.na(taxa$checked$accepted_name), na.rm = T )), - " names checked but without a match; ", - crayon::blue(sum(!taxa$tocheck$checked)), + " names checked but without a species-level match; ", + crayon::blue(sum(!is.na(taxa$tocheck$original_name))), " taxa yet to be checked" ) + } + + # otherwise if there are taxa that require checking add + # simple message that indicates number of perfect matches. + if (!all(taxa$tocheck$checked)) { + + perfect_matches <- taxa$tocheck %>% + dplyr::filter(original_name %in% resources$`APC list (accepted)`$canonical_name) %>% + dplyr::distinct(original_name) %>% + nrow() + if(!quiet) + message( + " -> of these ", + crayon::blue(perfect_matches), + " names have a perfect match to a scientific name in the APC. + Alignments being sought for remaining names." 
+ ) + } + # do the actual matching taxa <- - match_taxa(taxa, resources, fuzzy_abs_dist, fuzzy_rel_dist, fuzzy_matches, imprecise_fuzzy_matches, APNI_matches, identifier) %>% + match_taxa(taxa, resources, fuzzy_abs_dist, fuzzy_rel_dist, fuzzy_matches, + imprecise_fuzzy_matches, APNI_matches, identifier) %>% # reassemble dplyr::bind_rows() %>% dplyr::mutate(known = !is.na(aligned_name)) @@ -183,24 +349,33 @@ align_taxa <- function(original_name, taxa <- taxa %>% dplyr::select(-genus, -known, -checked) %>% - dplyr::select(original_name, cleaned_name, aligned_name, taxonomic_dataset, taxon_rank, aligned_reason, alignment_code, everything()) + dplyr::select(original_name, cleaned_name, aligned_name, + taxonomic_dataset, taxon_rank, aligned_reason, + alignment_code, dplyr::everything()) } else { taxa <- taxa %>% - dplyr::select(original_name, cleaned_name, aligned_name, taxonomic_dataset, taxon_rank, aligned_reason, alignment_code) + dplyr::select(original_name, cleaned_name, aligned_name, + taxonomic_dataset, taxon_rank, aligned_reason, + alignment_code, identifier) } # Assemble output in the order of the input # by joining results into a tibble with inputs as column taxa <- - dplyr::tibble(original_name = original_name) %>% - dplyr::left_join(by = "original_name", taxa) + dplyr::tibble(original_name = original_name, identifier = identifier) %>% + dplyr::left_join(by = c("original_name", "identifier"), taxa) %>% + # can remove column identifier now that matches are complete + dplyr::select(-identifier) ## save outputs to file, useful for caching results if (!is.null(output)) { dir.create(dirname(output), FALSE, TRUE) + taxa$checked<-TRUE + taxa$known<-!is.na(taxa$aligned_name) readr::write_csv(taxa, output) - message(" - output saved in file: ", output) + if(!quiet) + message(" - output saved in file: ", output) } return(taxa) @@ -209,7 +384,8 @@ align_taxa <- function(original_name, # function moves taxa from tocheck to checked redistribute <- function(data) { 
data[["checked"]] <- dplyr::bind_rows(data[["checked"]], - data[["tocheck"]] %>% dplyr::filter(checked)) + data[["tocheck"]] %>% + dplyr::filter(checked)) data[["tocheck"]] <- data[["tocheck"]] %>% dplyr::filter(!checked) diff --git a/R/create_species_state_origin_matrix.R b/R/create_species_state_origin_matrix.R index 59e6f564..c3e20bac 100644 --- a/R/create_species_state_origin_matrix.R +++ b/R/create_species_state_origin_matrix.R @@ -1,16 +1,22 @@ -#' Use the taxon distribution data from the APC to determine state level native and introduced origin status +#' @title State level native and introduced origin status +#' +#' @description +#' This function uses the taxon distribution data from the APC to determine +#' state level native and introduced origin status. #' #' This function processes the geographic data available in the APC and #' returns state level native, introduced and more complicated origins status for all taxa. #' #' #' @family diversity methods -#' @param resources the taxonomic resources required to make the summary statistics. Loading this can be slow, so call load_taxonomic_resources separately to greatly speed this function up and pass the resources in. +#' @param resources the taxonomic resources required to make the summary statistics. +#' Loading this can be slow, so call load_taxonomic_resources separately to greatly +#' speed this function up and pass the resources in. #' -#' @return A tibble with columns representing each state and rows representing each species. The values in each cell represent the origin of the species in that state. +#' @return A tibble with columns representing each state and rows representing each +#' species. The values in each cell represent the origin of the species in that state. 
#' -#' @import dplyr -#' @import stringr +#' #' @export #' #' @seealso \code{\link{load_taxonomic_resources}} @@ -44,13 +50,13 @@ separate_states <- function(data) { #' @noRd identify_places <- function(sep_state_data) { all_codes <- unique(stringr::str_trim(unlist(sep_state_data))) - unique(stringr::word(all_codes[!is.na(all_codes)], 1, 1)) + unique(word(all_codes[!is.na(all_codes)], 1, 1)) } #' @noRd create_species_df <- function(apc_places, apc_species) { species_df <- dplyr::tibble(species = apc_species$canonical_name) - for (i in 1:length(apc_places)) { + for (i in seq_along(apc_places)) { species_df <- dplyr::bind_cols(species_df, NA, .name_repair = "minimal") } names(species_df) <- c("species", apc_places) @@ -76,7 +82,7 @@ state_parse_and_add_column <- function(species_df, state, apc_species) { #' @noRd parse_states <- function(species_df, apc_places, apc_species) { - for (i in 1:length(apc_places)) { + for (i in seq_along(apc_places)) { species_df <- state_parse_and_add_column(species_df, apc_places[i], apc_species) } return(species_df) diff --git a/R/create_taxonomic_update_lookup.R b/R/create_taxonomic_update_lookup.R index 8fa2577b..0b3d3349 100644 --- a/R/create_taxonomic_update_lookup.R +++ b/R/create_taxonomic_update_lookup.R @@ -1,63 +1,174 @@ -#' Create a lookup table with the best-possible scientific name match for a list of Australian plant names +#' @title Create a table with the best-possible scientific name match for +#' Australian plant names #' -#' This function takes a list of Australian plant names that need to be reconciled with current taxonomy and -#' generates a lookup table of the best-possible scientific name match for each input name. -#' It uses first the function `align_taxa`, then the function `update_taxonomy` to achieve the output. 
+#' @description +#' This function takes a list of Australian plant names that need to be +#' reconciled with current taxonomy and generates a lookup table of the +#' best-possible scientific name match for each input name. +#' +#' Usage case: This is APCalign’s core function, merging together the alignment +#' and updating of taxonomy. +#' +#' @details +#' - It uses first the function `align_taxa`, then the function `update_taxonomy` +#' to achieve the output. The aligned name is plant name that has been aligned +#' to a taxon name in the APC or APNI by the align_taxa function. +#' +#' Notes: +#' +#' - If you will be running the function APCalign::create_taxonomic_update_lookup +#' many times, it is best to load the taxonomic resources separately using +#' `resources <- load_taxonomic_resources()`, then add the argument +#' resources = resources +#' - The name Banksia cerrata does not align as the fuzzy matching algorithm +#' does not allow the first letter of the genus and species epithet to change. +#' - The argument taxonomic_splits allows you to choose the outcome for updating +#' the names of taxa with ambiguous taxonomic histories; this applies to +#' scientific names that were once attached to a more broadly circumscribed +#' taxon concept, that was then split into several more narrowly circumscribed +#' taxon concepts, one of which retains the original name. There are three +#' options: most_likely_species returns the name that is retained, with +#' alternative names documented in square brackets; return_all adds additional +#' rows to the output, one for each possible taxon concept; +#' collapse_to_higher_taxon returns the genus with possible names in square +#' brackets. +#' - The argument identifier allows you to add a fix text string to all genus- +#' and family- level names, such as identifier = "Royal NP" would return +#' `Acacia sp. \[Royal NP]`. 
#' #' @family taxonomic alignment functions #' -#' @param taxa A list of Australian plant species that needs to be reconciled with current taxonomy. -#' @param stable_or_current_data either "stable" for a consistent version, or "current" for the leading edge version. +#' @param taxa A list of Australian plant species that needs to be reconciled +#' with current taxonomy. +#' @param stable_or_current_data either "stable" for a consistent version, +#' or "current" for the leading edge version. #' @param version The version number of the dataset to use. -#' @param taxonomic_splits How to handle one_to_many taxonomic matches. Default is "return_all". The other options are "collapse_to_higher_taxon" and "most_likely_species". most_likely_species defaults to the original_name if that name is accepted by the APC; this will be right for certain species subsets, but make errors in other cases, use with caution. -#' @param full logical for whether the full lookup table is returned or just key columns -#' @param resources These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer. If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} separately and pass the data in. -#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned off as a default. -#' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned off as a default. -#' @param identifier A dataset, location or other identifier, which defaults to NA. -#' @param output file path to save the intermediate output to -#' @return A lookup table containing the accepted and suggested names for each original name input, and additional taxonomic information such as taxon rank, taxonomic status, taxon IDs and genera. +#' @param taxonomic_splits How to handle one_to_many taxonomic matches. +#' Default is "return_all". 
The other options are "collapse_to_higher_taxon" +#' and "most_likely_species". most_likely_species defaults to the original_name +#' if that name is accepted by the APC; this will be right for certain species +#' subsets, but make errors in other cases, use with caution. +#' @param full logical for whether the full lookup table is returned or +#' just key columns +#' @param fuzzy_abs_dist The number of characters allowed to be different for +#' a fuzzy match. +#' @param fuzzy_rel_dist The proportion of characters allowed to be different +#' for a fuzzy match. +#' @param fuzzy_matches Fuzzy matches are turned on as a default. The relative +#' and absolute distances allowed for fuzzy matches to species and +#' infraspecific taxon names are defined by the parameters `fuzzy_abs_dist` +#' and `fuzzy_rel_dist`. +#' @param resources These are the taxonomic resources used for cleaning, this +#' will default to loading them from a local place on your computer. If this is +#' to be called repeatedly, it's much faster to load the resources using +#' \code{\link{load_taxonomic_resources}} separately and pass the data in. +#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) +#' are turned off as a default. +#' @param imprecise_fuzzy_matches Imprecise fuzzy matches uses the fuzzy +#' matching function with lenient levels set (absolute distance of +#' 5 characters; relative distance = 0.25). +#' It offers a way to get a wider range of possible names, possibly +#' corresponding to very distant spelling mistakes. +#' This is FALSE as default and all outputs should be checked as it often +#' makes erroneous matches. +#' @param identifier A dataset, location or other identifier, +#' which defaults to NA. +#' @param quiet Logical to indicate whether to display messages while +#' aligning taxa. +#' @param output file path to save the output. 
If this file already exists, +#' this function will check if it's a subset of the species passed in and try +#' to add to this file. This can be useful for large and growing projects. +#' @return A lookup table containing the accepted and suggested names for each +#' original name input, and additional taxonomic information such as taxon +#' rank, taxonomic status, taxon IDs and genera. #' - original_name: the original plant name. -#' - aligned_name: the input plant name that has been aligned to a taxon name in the APC or APNI by the align_taxa function. +#' - aligned_name: the input plant name that has been aligned to a taxon name in +#' the APC or APNI by the align_taxa function. #' - accepted_name: the APC-accepted plant name, when available. -#' - suggested_name: the suggested plant name to use. Identical to the accepted_name, when an accepted_name exists; otherwise the the suggested_name is the aligned_name. -#' - genus: the genus of the accepted (or suggested) name; only APC-accepted genus names are filled in. -#' - family: the family of the accepted (or suggested) name; only APC-accepted family names are filled in. +#' - suggested_name: the suggested plant name to use. Identical to the +#' accepted_name, when an accepted_name exists; +#' otherwise the the suggested_name is the aligned_name. +#' - genus: the genus of the accepted (or suggested) name; +#' only APC-accepted genus names are filled in. +#' - family: the family of the accepted (or suggested) name; +#' only APC-accepted family names are filled in. #' - taxon_rank: the taxonomic rank of the suggested (and accepted) name. -#' - taxonomic_dataset: the source of the suggested (and accepted) names (APC or APNI). +#' - taxonomic_dataset: the source of the suggested (and accepted) names +#' (APC or APNI). #' - taxonomic_status: the taxonomic status of the suggested (and accepted) name. -#' - taxonomic_status_aligned: the taxonomic status of the aligned name, before any taxonomic updates have been applied. 
-#' - aligned_reason: the explanation of a specific taxon name alignment (from an original name to an aligned name). -#' - update_reason: the explanation of a specific taxon name update (from an aligned name to an accepted or suggested name). +#' - taxonomic_status_aligned: the taxonomic status of the aligned name, +#' before any taxonomic updates have been applied. +#' - aligned_reason: the explanation of a specific taxon name alignment +#' (from an original name to an aligned name). +#' - update_reason: the explanation of a specific taxon name update +#' (from an aligned name to an accepted or suggested name). #' - subclass: the subclass of the accepted name. -#' - taxon_distribution: the distribution of the accepted name; only filled in if an APC accepted_name is available. -#' - scientific_name_authorship: the authorship information for the accepted (or synonymous) name; available for both APC and APNI names. -#' - taxon_ID: the unique taxon concept identifier for the accepted_name; only filled in if an APC accepted_name is available. -#' - taxon_ID_genus: an identifier for the genus; only filled in if an APC-accepted genus name is available. -#' - scientific_name_ID: an identifier for the nomenclatural (not taxonomic) details of a scientific name; available for both APC and APNI names. +#' - taxon_distribution: the distribution of the accepted name; +#' only filled in if an APC accepted_name is available. +#' - scientific_name_authorship: the authorship information for the accepted +#' (or synonymous) name; available for both APC and APNI names. +#' - taxon_ID: the unique taxon concept identifier for the accepted_name; +#' only filled in if an APC accepted_name is available. +#' - taxon_ID_genus: an identifier for the genus; +#' only filled in if an APC-accepted genus name is available. +#' - scientific_name_ID: an identifier for the nomenclatural (not taxonomic) +#' details of a scientific name; available for both APC and APNI names. 
#' - row_number: the row number of a specific original_name in the input. -#' - number_of_collapsed_taxa: when taxonomic_splits == "collapse_to_higher_taxon", the number of possible taxon names that have been collapsed. +#' - number_of_collapsed_taxa: when taxonomic_splits == "collapse_to_higher_taxon", +#' the number of possible taxon names that have been collapsed. #' #' @export #' #' @seealso \code{\link{load_taxonomic_resources}} #' @examples -#' \donttest{resources <- load_taxonomic_resources() +#' \donttest{ +#' resources <- load_taxonomic_resources() +#' +#' # example 1 #' create_taxonomic_update_lookup(c("Eucalyptus regnans", #' "Acacia melanoxylon", #' "Banksia integrifolia", #' "Not a species"), -#' resources=resources) -#'} +#' resources = resources) +#' +#' # example 2 +#' input <- c("Banksia serrata", "Banksia serrate", "Banksia cerrata", +#' "Banksea serrata", "Banksia serrrrata", "Dryandra") +#' +#' create_taxonomic_update_lookup( +#' taxa = input, +#' identifier = "APCalign test", +#' full = TRUE, +#' resources = resources +#' ) +#' +#' # example 3 +#' taxon_list <- +#' readr::read_csv( +#' system.file("extdata", "test_taxa.csv", package = "APCalign"), +#' show_col_types = FALSE) +#' +#' create_taxonomic_update_lookup( +#' taxa = taxon_list$original_name, +#' identifier = taxon_list$notes, +#' full = TRUE, +#' resources = resources +#' ) +#' } +#' create_taxonomic_update_lookup <- function(taxa, stable_or_current_data = "stable", version = default_version(), taxonomic_splits = "most_likely_species", full = FALSE, + fuzzy_abs_dist = 3, + fuzzy_rel_dist = 0.2, + fuzzy_matches = TRUE, APNI_matches = TRUE, imprecise_fuzzy_matches = FALSE, identifier = NA_character_, resources = load_taxonomic_resources(), + quiet = FALSE, output = NULL) { validate_taxonomic_splits_input(taxonomic_splits) @@ -66,12 +177,18 @@ create_taxonomic_update_lookup <- function(taxa, align_taxa(taxa, resources = resources, APNI_matches = APNI_matches, identifier = identifier, - 
imprecise_fuzzy_matches = imprecise_fuzzy_matches) + fuzzy_abs_dist = fuzzy_abs_dist, + fuzzy_rel_dist = fuzzy_rel_dist, + fuzzy_matches = fuzzy_matches, + imprecise_fuzzy_matches = imprecise_fuzzy_matches, + quiet = quiet, + output=output) updated_data <- update_taxonomy(aligned_data, taxonomic_splits = taxonomic_splits, - resources = resources, + resources = resources, + quiet = quiet, output = output) if (!full) { @@ -79,8 +196,11 @@ create_taxonomic_update_lookup <- function(taxa, updated_data %>% dplyr::select( dplyr::any_of(c( - "original_name", "aligned_name", "accepted_name", "suggested_name", "genus", "taxon_rank", "taxonomic_dataset", "taxonomic_status", "scientific_name", "aligned_reason", "update_reason", - "alternative_possible_names", "possible_names_collapsed", "number_of_collapsed_taxa" + "original_name", "aligned_name", "accepted_name", "suggested_name", + "genus", "taxon_rank", "taxonomic_dataset", "taxonomic_status", + "scientific_name", "aligned_reason", "update_reason", + "alternative_possible_names", "possible_names_collapsed", + "number_of_collapsed_taxa" )) ) } @@ -100,7 +220,8 @@ validate_taxonomic_splits_input <- function(taxonomic_splits) { paste( "Invalid input:", taxonomic_splits, - ". Valid inputs are 'return_all', 'collapse_to_higher_taxon', or 'most_likely_species'." + ". Valid inputs are 'return_all', 'collapse_to_higher_taxon', or + 'most_likely_species'." ) ) } diff --git a/R/fuzzy_match.R b/R/fuzzy_match.R index 28cc48e1..7994aa7f 100644 --- a/R/fuzzy_match.R +++ b/R/fuzzy_match.R @@ -1,27 +1,38 @@ #' Fuzzy match taxonomic names #' -#' This function attempts to match input strings to a list of allowable taxonomic names. -#' It requires that the first letter (or digit) of each word is identical between the input and output strings to avoid mis-matches +#' This function attempts to match input strings to a list of allowable +#' taxonomic names. 
+#' It requires that the first letter (or digit) of each word is identical +#' between the input and output strings to avoid mis-matches #' #' @param txt The string of text requiring a match #' @param accepted_list The list of accepted names attempting to match to -#' @param max_distance_abs The maximum allowable number of characters differing between the input string and the match -#' @param max_distance_rel The maximum proportional difference between the input string and the match +#' @param max_distance_abs The maximum allowable number of characters +#' differing between the input string and the match +#' @param max_distance_rel The maximum proportional difference between the +#' input string and the match #' @param n_allowed The number of allowable matches returned. Defaults to 1 -#' @param epithet_letters A string specifying if 1 or 2 letters remain fixed at the start of the species epithet. +#' @param epithet_letters A string specifying if 1 or 2 letters remain fixed +#' at the start of the species epithet. 
#' -#' @return A text string that matches a recognised taxon name or scientific name +#' @return A text string that matches a recognised taxon name or scientific +#' name +#' #' -#' #' @examples #' fuzzy_match("Baksia serrata", c("Banksia serrata", -#' "Banksia integrifolia"), +#' "Banksia integrifolia"), #' max_distance_abs = 1, #' max_distance_rel = 1) #' #' @noRd -fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel, n_allowed = 1, epithet_letters = 1) { - +fuzzy_match <- function(txt, accepted_list, + max_distance_abs, + max_distance_rel, + n_allowed = 1, + epithet_letters = 1 + ) { + if (!epithet_letters %in% c(1,2)) { stop("Epithet must be 1 or 2.") } @@ -29,14 +40,18 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel, words_in_text <- 1 + stringr::str_count(txt," ") ## extract first letter of first word - txt_word1_start <- stringr::str_extract(txt, "[:alpha:]") + txt_word1_start <- stringr::str_extract(txt, "[:alpha:]") %>% + stringr::str_to_lower() - ## for text matches with 2 or more words, extract the first letter/digit of the second word + ## for text matches with 2 or more words, + ## extract the first letter/digit of the second word if(words_in_text > 1 & epithet_letters == 2) {if(nchar(word(txt,2)) == 1) { - txt_word2_start <- stringr::str_extract(word(txt,2), "[:alpha:]|[:digit:]") + txt_word2_start <- stringr::str_extract(word(txt,2), + "[:alpha:]|[:digit:]") } else { - txt_word2_start <- stringr::str_extract(word(txt,2), "[:alpha:][:alpha:]|[:digit:]") + txt_word2_start <- stringr::str_extract(word(txt,2), + "[:alpha:][:alpha:]|[:digit:]") } } @@ -44,79 +59,124 @@ fuzzy_match <- function(txt, accepted_list, max_distance_abs, max_distance_rel, txt_word2_start <- stringr::str_extract(word(txt,2), "[:alpha:]|[:digit:]") } - ## for text matches with 3 or more words, extract the first letter/digit of the third word + ## for text matches with 3 or more words, + ## extract the first letter/digit of 
the third word if(words_in_text > 2) { txt_word3_start <- stringr::str_extract(word(txt,3), "[:alpha:]|[:digit:]") } - ## identify the number of characters that must change for the text string to match each of the possible accepted names - distance_c <- utils::adist(txt, accepted_list, fixed=TRUE)[1,] + ## subset accepted list to taxa that begin with the same first letter to + ## reduce the number of fuzzy matches that are made in the next step. + ## has also wanted to do this for the second word, but then need to separate + ## different lists of reference names - smaller time saving and not worth it. + ## need to add `unique`, because for `APC-known`, + ## sometimes duplicate canonical names each with a different taxonomic + ## status, and then you just want to retain the first one + accepted_list <- accepted_list[(stringr::str_extract(accepted_list, "[:alpha:]") %>% + stringr::str_to_lower()) == + (txt_word1_start %>% stringr::str_to_lower())] %>% + unique() + + ## identify the number of characters that must change for the text string to + ## match each of the possible accepted names + if (length(accepted_list) > 0) { + distance_c <- stringdist::stringdist(txt, accepted_list, method = "dl") - ## identify the minimum number of characters that must change for the text string to match a string in the list of accepted names + ## identify the minimum number of characters that must change for the text + ## string to match a string in the list of accepted names min_dist_abs_c <- min(distance_c) min_dist_per_c <- min(distance_c) / stringr::str_length(txt) i <- which(distance_c==min_dist_abs_c) + potential_matches <- accepted_list[i] - if( + ## Is there an acceptable fuzzy match? 
if not, break here + if(!( ## Within allowable number of characters (absolute) min_dist_abs_c <= max_distance_abs & ## Within allowable number of characters (relative) min_dist_per_c <= max_distance_rel & - ## Is a unique solution - length(i)<= n_allowed - ) { + ## Solution has up to n_allowed matches + length(potential_matches) <= n_allowed + ) ) { + return(NA) + } + + } else { + return(NA) + } + + # function to check if a match is ok + check_match <- function(potential_match) { + ## identify number of words in the matched string - words_in_match <- 1 + stringr::str_count(accepted_list[i]," ") + words_in_match <- 1 + stringr::str_count(potential_match," ") ## identify the first letter of the first word in the matched string - match_word1_start <- stringr::str_extract(accepted_list[i], "[:alpha:]") + match_word1_start <- stringr::str_extract(potential_match, "[:alpha:]") %>% + stringr::str_to_lower() - ## identify the first letter of the second word in the matched string (if the matched string includes 2+ words) + ## identify the first letter of the second word in the matched string + ## (if the matched string includes 2+ words) if(words_in_text > 1 & epithet_letters == 2) { - if(nchar(word(accepted_list[i],2)) == 1) { - match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:]|[:digit:]") + x <- word(potential_match,2) + if(nchar(x) == 1) { + match_word2_start <- stringr::str_extract(x, "[:alpha:]|[:digit:]") } else { - match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:][:alpha:]|[:digit:]") + match_word2_start <- stringr::str_extract(x, "[:alpha:][:alpha:]|[:digit:]") } } if(words_in_text > 1 & epithet_letters == 1) { - match_word2_start <- stringr::str_extract(word(accepted_list[i],2), "[:alpha:]|[:digit:]") + match_word2_start <- stringr::str_extract(word(potential_match,2), "[:alpha:]|[:digit:]") } - ## identify the first letter of the third word in the matched string (if the matched string includes 3+ words) + 
## identify the first letter of the third word in the matched string + ## (if the matched string includes 3+ words) if(words_in_text > 2) { - match_word3_start <- stringr::str_extract(word(accepted_list[i],3), "[:alpha:]|[:digit:]") + match_word3_start <- stringr::str_extract(word(potential_match,3), "[:alpha:]|[:digit:]") } - keep = FALSE - - ## keep match if the first letters of the first three words (or fewer if applicable) in the string to match - ## are identical to the first letters of the first three words in the matched string + ## keep match if the first letters of the first three words + ## (or fewer if applicable) in the string to match are identical to the + ## first letters of the first three words in the matched string if(words_in_text == 1) { - if (txt_word1_start == match_word1_start) { - keep = TRUE } + ## next line is no longer being used, + ## since only comparing to first-letter matches + if (txt_word1_start == match_word1_start) { + return(TRUE) + } } else if(words_in_text == 2) { - if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) { - keep = TRUE } - + if ( + txt_word1_start == match_word1_start & + txt_word2_start == match_word2_start + ) { + return(TRUE) + } } else if(words_in_text > 2) { if (words_in_match > 2) { - if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start & txt_word3_start == match_word3_start) { - keep = TRUE } - } else if (txt_word1_start == match_word1_start & txt_word2_start == match_word2_start) { - keep = TRUE } - } - - if(keep == TRUE) { - - return(accepted_list[i]) - + if ( + txt_word1_start == match_word1_start & + txt_word2_start == match_word2_start & + txt_word3_start == match_word3_start + ) { + return(TRUE) + } + } else if ( + txt_word1_start == match_word1_start & + txt_word2_start == match_word2_start + ) { + return(TRUE)} } - return(NA) + return(FALSE) } - return(NA) + + j <- purrr::map_lgl(potential_matches, check_match) + + if(!any(j)) return(NA) + + 
return(potential_matches[j]) } + diff --git a/R/load_taxonomic_resources.R b/R/load_taxonomic_resources.R index ff77309d..91b2acde 100644 --- a/R/load_taxonomic_resources.R +++ b/R/load_taxonomic_resources.R @@ -1,42 +1,60 @@ -#' Load taxonomic resources from either stable or current versions of APC and APNI +#' @title Load taxonomic reference lists, APC & APNI +#' +#' @description +#' This function loads two taxonomic datasets for Australia's vascular plants, +#' the APC and APNI, into the global environment. It creates several data frames +#' by filtering and selecting data from the loaded lists. +#' +#' @details +#' - It accesses taxonomic data from a dataset using the provided version number +#' or the default version. +#' - The output is several dataframes that include subsets of the APC/APNI based +#' on taxon rank and taxonomic status. #' -#' This function loads two taxonomic datasets for Australia's vascular plants, the APC and APNI, into the global environment. -#' It accesses taxonomic data from a dataset using the provided version number or the default version. -#' The function creates several data frames by filtering and selecting data from the loaded lists. -#' -#' @param stable_or_current_data Type of dataset to access. The default is "stable", which loads the -#' dataset from a github archived file. If set to "current", the dataset will be loaded from -#' a URL which is the cutting edge version, but this may change at any time without notice. -#' @param version The version number of the dataset to use. Defaults to the default version. -#' -#' @param reload A logical indicating whether to reload the dataset from the data source. Defaults to FALSE. +#' @param stable_or_current_data Type of dataset to access. +#' The default is "stable", which loads the dataset from a github archived file. +#' If set to "current", the dataset will be loaded from a URL which is the +#' cutting edge version, but this may change at any time without notice. 
+#' @param version The version number of the dataset to use. +#' Defaults to the default version. +#' +#' @param quiet A logical indicating whether to print status of loading to screen. +#' Defaults to FALSE. #' #' @return The taxonomic resources data loaded into the global environment. #' @export #' #' @examples -#' \donttest{load_taxonomic_resources(stable_or_current_data="stable",version="0.0.2.9000")} +#' \donttest{ +#' load_taxonomic_resources(stable_or_current_data="stable", +#' version="0.0.2.9000")} #' -#' @importFrom dplyr filter select mutate distinct arrange -#' @importFrom crayon red load_taxonomic_resources <- function(stable_or_current_data = "stable", version = default_version(), - reload = FALSE) { - message("Loading resources...", appendLF = FALSE) - on.exit(message("...done")) + quiet = FALSE) { + + taxonomic_resources <- dataset_access_function( version = version, path = tools::R_user_dir("APCalign"), type = stable_or_current_data ) - + + + total_steps <- 3 # Define how many steps you expect in the function + pb <- utils::txtProgressBar(min = 0, max = total_steps, style = 2) + if(!quiet){ + message("Loading resources into memory...") + utils::setTxtProgressBar(pb, 1) + } if(is.null(taxonomic_resources)) { return(NULL) } + # Give list names names(taxonomic_resources) <- c("APC", "APNI") @@ -44,62 +62,40 @@ load_taxonomic_resources <- ### Note: Use `zzzz zzzz` because the fuzzy matching algorithm can't handles NA's zzz <- "zzzz zzzz" + column_rename <- + c( + taxon_ID = "taxonID", + taxon_rank = "taxonRank", + name_type = "nameType", + taxonomic_status = "taxonomicStatus", + pro_parte = "proParte", + scientific_name = "scientificName", + scientific_name_ID = "scientificNameID", + accepted_name_usage_ID = "acceptedNameUsageID", + accepted_name_usage = "acceptedNameUsage", + canonical_name = "canonicalName", + scientific_name_authorship = "scientificNameAuthorship", + taxon_rank_sort_order = "taxonRankSortOrder", + taxon_remarks = "taxonRemarks", + 
taxon_distribution = "taxonDistribution", + higher_classification = "higherClassification", + nomenclatural_code = "nomenclaturalCode", + dataset_name = "datasetName", + name_element = "nameElement" + ) + taxonomic_resources$APC <- taxonomic_resources$APC %>% - rename( - taxon_ID = .data$taxonID, - taxon_rank = .data$taxonRank, - name_type = .data$nameType, - taxonomic_status = .data$taxonomicStatus, - pro_parte = .data$proParte, - scientific_name = .data$scientificName, - scientific_name_ID = .data$scientificNameID, - accepted_name_usage_ID = .data$acceptedNameUsageID, - accepted_name_usage = .data$acceptedNameUsage, - canonical_name = .data$canonicalName, - scientific_name_authorship = .data$scientificNameAuthorship, - taxon_rank_sort_order = .data$taxonRankSortOrder, - taxon_remarks = .data$taxonRemarks, - taxon_distribution = .data$taxonDistribution, - higher_classification = .data$higherClassification, - nomenclatural_code = .data$nomenclaturalCode, - dataset_name = .data$datasetName - ) %>% - mutate( + dplyr::rename(dplyr::any_of(column_rename)) %>% + dplyr::mutate( genus = extract_genus(canonical_name), - taxon_rank = stringr::str_to_lower(taxon_rank), - taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"), - taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"), - taxon_rank = stringr::str_replace(taxon_rank, "ordo", "order"), - taxon_rank = stringr::str_replace(taxon_rank, "familia", "family"), - taxon_rank = stringr::str_replace(taxon_rank, "varietas", "variety"), - taxon_rank = stringr::str_replace(taxon_rank, "forma", "form"), - taxon_rank = stringr::str_replace(taxon_rank, "sectio", "section") + taxon_rank = standardise_taxon_rank(taxon_rank) ) taxonomic_resources$APNI <- taxonomic_resources$APNI %>% - rename( - name_type = .data$nameType, - taxonomic_status = .data$taxonomicStatus, - taxon_rank = .data$taxonRank, - scientific_name = .data$scientificName, - scientific_name_ID = .data$scientificNameID, - canonical_name = 
.data$canonicalName, - scientific_name_authorship = .data$scientificNameAuthorship, - taxon_rank_sort_order = .data$taxonRankSortOrder, - nomenclatural_code = .data$nomenclaturalCode, - dataset_name = .data$datasetName, - name_element = .data$nameElement - ) %>% - mutate( + dplyr::rename(dplyr::any_of(column_rename)) %>% + dplyr::mutate( genus = extract_genus(canonical_name), - taxon_rank = stringr::str_to_lower(taxon_rank), - taxon_rank = stringr::str_replace(taxon_rank, "regnum", "kingdom"), - taxon_rank = stringr::str_replace(taxon_rank, "classis", "class"), - taxon_rank = stringr::str_replace(taxon_rank, "ordo", "order"), - taxon_rank = stringr::str_replace(taxon_rank, "familia", "family"), - taxon_rank = stringr::str_replace(taxon_rank, "varietas", "variety"), - taxon_rank = stringr::str_replace(taxon_rank, "forma", "form"), - taxon_rank = stringr::str_replace(taxon_rank, "sectio", "section") + taxon_rank = standardise_taxon_rank(taxon_rank) ) APC_tmp <- @@ -119,21 +115,24 @@ load_taxonomic_resources <- genus ) %>% dplyr::mutate( - # strip_names removes punctuation and filler words associated with infraspecific taxa (subsp, var, f, ser) + ## strip_names removes punctuation and filler words associated with + ## infraspecific taxa (subsp, var, f, ser) stripped_canonical = strip_names(canonical_name), - ## strip_names2 removes punctuation, filler words associated with infraspecific taxa (subsp, var, f, ser), and filler words associated with species name cases (x, sp) - ## strip_names2 is essential for the matches involving 2 or 3 words, since you want those words to not count filler words - stripped_canonical2 = strip_names_2(canonical_name), + ## strip_names_extra removes extra filler words associated with + ## species name cases (x, sp) + ## strip_names_extra is essential for the matches involving 2 or 3 words, + ## since you want those words to not count filler words + stripped_canonical2 = strip_names_extra(stripped_canonical), stripped_scientific = 
strip_names(scientific_name), binomial = ifelse( taxon_rank == "species", - stringr::word(stripped_canonical2, start = 1, end = 2), + word(stripped_canonical2, start = 1, end = 2), zzz ), binomial = ifelse(is.na(binomial), zzz, binomial), binomial = base::replace(binomial, duplicated(binomial), zzz), genus = extract_genus(stripped_canonical), - trinomial = stringr::word(stripped_canonical2, start = 1, end = 3), + trinomial = word(stripped_canonical2, start = 1, end = 3), trinomial = ifelse(is.na(trinomial), zzz, trinomial), trinomial = base::replace(trinomial, duplicated(trinomial), zzz), ) %>% @@ -149,6 +148,8 @@ load_taxonomic_resources <- dplyr::filter(taxonomic_status != "accepted") %>% dplyr::mutate(taxonomic_dataset = "APC") + + if(!quiet) utils::setTxtProgressBar(pb, 2) # Repeated from above - bionomial, tronomials etc taxonomic_resources[["APNI names"]] <- taxonomic_resources$APNI %>% @@ -163,15 +164,15 @@ load_taxonomic_resources <- dplyr::mutate( taxonomic_status = "unplaced for APC", stripped_canonical = strip_names(canonical_name), - stripped_canonical2 = strip_names_2(canonical_name), + stripped_canonical2 = strip_names_extra(stripped_canonical), stripped_scientific = strip_names(scientific_name), binomial = ifelse( taxon_rank == "species", - stringr::word(stripped_canonical2, start = 1, end = 2), + word(stripped_canonical2, start = 1, end = 2), "zzzz zzzz" ), binomial = ifelse(is.na(binomial), "zzzz zzzz", binomial), - trinomial = stringr::word(stripped_canonical2, start = 1, end = 3), + trinomial = word(stripped_canonical2, start = 1, end = 3), trinomial = ifelse(is.na(trinomial), "zzzz zzzz", trinomial), trinomial = base::replace(trinomial, duplicated(trinomial), "zzzz zzzz"), genus = extract_genus(stripped_canonical), @@ -195,7 +196,7 @@ load_taxonomic_resources <- genus ) %>% dplyr::filter(taxon_rank %in% c("genus"), taxonomic_status == "accepted") %>% - dplyr::filter(!stringr::str_detect(stringr::word(genus, 1), "aceae$")) %>% + 
dplyr::filter(!stringr::str_detect(genus, "aceae$")) %>% dplyr::mutate(taxonomic_dataset = "APC") taxonomic_resources[["genera_synonym"]] <- @@ -214,10 +215,11 @@ load_taxonomic_resources <- ) %>% dplyr::filter(taxon_rank %in% c("genus")) %>% dplyr::filter(!canonical_name %in% taxonomic_resources$genera_accepted$canonical_name) %>% - dplyr::filter(!stringr::str_detect(stringr::word(genus, 1), "aceae$")) %>% + dplyr::filter(!stringr::str_detect(genus, "aceae$")) %>% dplyr::mutate(taxonomic_dataset = "APC") %>% dplyr::distinct(canonical_name, .keep_all = TRUE) + if(!quiet) utils::setTxtProgressBar(pb, 3) taxonomic_resources[["genera_APNI"]] <- taxonomic_resources$APNI %>% dplyr::select( @@ -231,7 +233,7 @@ load_taxonomic_resources <- ) %>% dplyr::filter(taxon_rank %in% c("genus")) %>% dplyr::filter(!canonical_name %in% taxonomic_resources$APC$canonical_name) %>% - dplyr::filter(!stringr::str_detect(stringr::word(genus, 1), "aceae$")) %>% + dplyr::filter(!stringr::str_detect(genus, "aceae$")) %>% dplyr::mutate(taxonomic_dataset = "APNI") %>% dplyr::distinct(canonical_name, .keep_all = TRUE) @@ -242,7 +244,7 @@ load_taxonomic_resources <- taxonomic_resources$genera_APNI ) %>% dplyr::mutate( - cleaned_name = stringr::word(accepted_name_usage, 1), + cleaned_name = word(accepted_name_usage, 1), cleaned_name = ifelse(is.na(cleaned_name), canonical_name, cleaned_name) ) %>% dplyr::distinct(cleaned_name, canonical_name, scientific_name, .keep_all = TRUE) @@ -250,14 +252,35 @@ load_taxonomic_resources <- taxonomic_resources[["family_accepted"]] <- taxonomic_resources$APC %>% dplyr::filter(taxon_rank %in% c("family"), taxonomic_status == "accepted") + + taxonomic_resources[["family_synonym"]] <- + taxonomic_resources$APC %>% + dplyr::select( + canonical_name, + accepted_name_usage, + accepted_name_usage_ID, + scientific_name, + taxonomic_status, + taxon_ID, + scientific_name_ID, + name_type, + taxon_rank, + genus + ) %>% + dplyr::filter(taxon_rank %in% c("family"), 
taxonomic_status != "accepted") %>% + dplyr::mutate(taxonomic_dataset = "APC") %>% + dplyr::distinct(canonical_name, .keep_all = TRUE) + close(pb) + if(!quiet) message("...done") return(taxonomic_resources) } ##' Access Australian Plant Census Dataset ##' -##' This function provides access to the Australian Plant Census dataset containing information -##' about various species. The dataset can be loaded from a github for a stable file or from a URL for the most cutting-edge, but not stable version. +##' This function provides access to the Australian Plant Census dataset +##' about various species. The dataset can be loaded from a github for a stable file or +##' from a URL for the most cutting-edge, but not stable version. ##' ##' @param version Version number. The default is NULL, which will load the most recent ##' version of the dataset on your computer or the most recent version known @@ -268,8 +291,9 @@ load_taxonomic_resources <- ##' delete the persistent data at any time by running `mydata_del(NULL)` (or ##' `mydata_del(NULL, path)` if you use a different path). ##' @param type Type of dataset to access. The default is "stable", which loads the -##' dataset from a github archived file. If set to "current", the dataset will be loaded from -##' a URL which is the cutting edge version, but this may change at any time without notice. +##' dataset from a github archived file. If set to "current", the dataset will be +##' loaded from a URL which is the cutting edge version, but this may change at any time +##' without notice. 
##' ##' @examples ##' @@ -303,7 +327,6 @@ dataset_access_function <- tryCatch({ APC <- readr::read_csv( "https://biodiversity.org.au/nsl/services/export/taxonCsv", - n_max = 110000, col_types = readr::cols( .default = readr::col_character(), @@ -317,7 +340,6 @@ dataset_access_function <- APNI <- readr::read_csv( "https://biodiversity.org.au/nsl/services/export/namesCsv", - n_max = 140000, col_types = readr::cols( .default = readr::col_character(), @@ -352,7 +374,7 @@ dataset_access_function <- #' @return A character string representing the default version for stable data. #' #' -#' @noRd +#' @export default_version <- function() { # Check if there is internet connection @@ -442,6 +464,7 @@ dataset_get <- function(version = default_version(), path_to_apni <- file.path(path, paste0("apni", version, ".parquet")) APC <- if (!file.exists(path_to_apc)) { + message("Downloading...") download_and_read_parquet(apc.url, path_to_apc) } else { arrow::read_parquet(path_to_apc) diff --git a/R/match_taxa.R b/R/match_taxa.R index 64f8c1c7..d7050765 100644 --- a/R/match_taxa.R +++ b/R/match_taxa.R @@ -1,20 +1,45 @@ -#' Match taxonomic names to accepted names in list +#' @title Match taxonomic names to names in the APC/APNI #' -#' This function attempts to match input strings to a list of allowable taxonomic names. -#' It cycles through more than 20 different string patterns, sequentially searching for additional match patterns. -#' It identifies string patterns in input names that suggest a name can only be aligned to a genus (hybrids that are not accepted names; graded species; taxa not identified to species). -#' It prioritises matches that do not require fuzzy matching (i.e. synonyms, orthographic variants) over those that do. -#' If prioritises matches to taxa in the APC over names in the APNI. +#' @description +#' This function attempts to match input strings to Australia's reference lists +#' for vascular plants, the APC and APNI. It attempts: +#' 1. 
perfect matches and fuzzy matches +#' 2. matches to infraspecies, species, genus, and family names +#' 3. matches to the entire input string and subsets thereof +#' 4. searches for string patterns that suggest a specific taxon rank +#' +#' @details +#' - It cycles through more than 20 different string patterns, sequentially +#' searching for additional match patterns. +#' - It identifies string patterns in input names that suggest a name can only be +#' aligned to a genus (hybrids that are not accepted names; graded species; +#' taxa not identified to species). +#' - It prioritises matches that do not require fuzzy matching (i.e. synonyms, +#' orthographic variants) over those that do. +#' - It prioritises matches to taxa in the APC over names in the APNI. #' #' @param taxa The list of taxa requiring checking # -#' @param resources The list(s) of accepted names to check against, loaded through the function `load_taxonomic_resources()` -#' @param fuzzy_abs_dist The number of characters allowed to be different for a fuzzy match. -#' @param fuzzy_rel_dist The proportion of characters allowed to be different for a fuzzy match. -#' @param fuzzy_matches Fuzzy matches are turned on as a default. The relative and absolute distances allowed for fuzzy matches to species and infraspecific taxon names are defined by the parameters `fuzzy_abs_dist` and `fuzzy_rel_dist` -#' @param imprecise_fuzzy_matches Imprecise fuzzy matches are turned off as a default. -#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) are turned off as a default. -#' @param identifier A dataset, location or other identifier, which defaults to NA. +#' @param resources The list(s) of accepted names to check against, +#' loaded through the function `load_taxonomic_resources()` +#' @param fuzzy_abs_dist The number of characters allowed to be different +#' for a fuzzy match. +#' @param fuzzy_rel_dist The proportion of characters allowed to be different +#' for a fuzzy match.
+#' @param fuzzy_matches Fuzzy matches are turned on as a default. The relative +#' and absolute distances allowed for fuzzy matches to species and +#' infraspecific taxon names are defined by the parameters +#' `fuzzy_abs_dist` and `fuzzy_rel_dist` +#' @param imprecise_fuzzy_matches Imprecise fuzzy matches uses the fuzzy +#' matching function with lenient levels set (absolute distance of +#' 5 characters; relative distance = 0.25). +#' It offers a way to get a wider range of possible names, possibly +#' corresponding to very distant spelling mistakes. This is FALSE as default +#' and all outputs should be checked as it often makes erroneous matches. +#' @param APNI_matches Name matches to the APNI (Australian Plant Names Index) +#' are turned off as a default. +#' @param identifier A dataset, location or other identifier, +#' which defaults to NA. #' #' @noRd match_taxa <- function( @@ -33,7 +58,8 @@ match_taxa <- function( } - ## A function that specifies particular fuzzy matching conditions (for the function fuzzy_match) when matching is being done at the genus level. + ## A function that specifies particular fuzzy matching conditions (for the + ## function fuzzy_match) when matching is being done at the genus level. 
if (fuzzy_matches == TRUE) { fuzzy_match_genera <- function(x, y) { purrr::map_chr(x, ~ fuzzy_match(.x, y, 2, 0.35, n_allowed = 1)) @@ -48,7 +74,8 @@ match_taxa <- function( imprecise_fuzzy_abs_dist <- 5 imprecise_fuzzy_rel_dist <- 0.25 - ## override all fuzzy matching parameters with absolute and relative distances of 0 if fuzzy matching is turned off + ## override all fuzzy matching parameters with absolute and + ## relative distances of 0 if fuzzy matching is turned off if (fuzzy_matches == FALSE) { fuzzy_abs_dist <- 0 fuzzy_rel_dist <- 0 @@ -56,15 +83,17 @@ match_taxa <- function( imprecise_fuzzy_rel_dist <- 0 } - ## remove APNI-listed genera from resources if APNI matches are turned off (the default) + ## remove APNI-listed genera from resources if APNI matches are turned off + ##(the default) if (APNI_matches == TRUE) { resources$genera_all2 <- resources$genera_all } else { - resources$genera_all2 <- resources$genera_all %>% filter(taxonomic_dataset != "APNI") + resources$genera_all2 <- resources$genera_all %>% dplyr::filter(taxonomic_dataset != "APNI") } ## Repeatedly used identifier strings are created. - ## These identifier strings are added to the aligned names of taxa that do not match to an APC or APNI species or infra-specific level name. + ## These identifier strings are added to the aligned names of taxa that do + ## not match to an APC or APNI species or infra-specific level name. 
taxa$tocheck <- taxa$tocheck %>% dplyr::mutate( identifier_string = ifelse(is.na(identifier), NA_character_, paste0(" [", identifier, "]")), @@ -81,20 +110,15 @@ match_taxa <- function( stripped_name = stripped_name %>% update_na_with(strip_names(cleaned_name)), stripped_name2 = stripped_name2 %>% - update_na_with(strip_names_2(cleaned_name)), - trinomial = stringr::word(stripped_name2, start = 1, end = 3), - binomial = stringr::word(stripped_name2, start = 1, end = 2), - genus = extract_genus(original_name), - fuzzy_match_genus = - fuzzy_match_genera(genus, resources$genera_accepted$genus), - fuzzy_match_genus_synonym = - fuzzy_match_genera(genus, resources$genera_synonym$genus), - fuzzy_match_genus_APNI = - fuzzy_match_genera(genus, resources$genera_APNI$genus) + update_na_with(strip_names_extra(stripped_name)), + trinomial = word(stripped_name2, start = 1, end = 3), + binomial = word(stripped_name2, start = 1, end = 2), + genus = extract_genus(original_name) ) ## Taxa that have been checked are moved from `taxa$tocheck` to `taxa$checked` - ## These lines of code are repeated after each matching cycle to progressively move taxa from `tocheck` to `checked` + ## These lines of code are repeated after each matching cycle to + ## progressively move taxa from `tocheck` to `checked` taxa <- redistribute(taxa) if (nrow(taxa$tocheck) == 0) @@ -114,7 +138,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (accepted)`$taxon_rank[ii], aligned_name = resources$`APC list (accepted)`$canonical_name[ii], @@ -145,7 +169,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (known names)`$taxon_rank[ii], aligned_name = resources$`APC list (known names)`$canonical_name[ii], @@ -175,7 +199,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + 
dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (accepted)`$taxon_rank[ii], aligned_name = resources$`APC list (accepted)`$canonical_name[ii], @@ -205,7 +229,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (known names)`$taxon_rank[ii], aligned_name = resources$`APC list (known names)`$canonical_name[ii], @@ -230,7 +254,7 @@ match_taxa <- function( i <- stringr::str_detect(taxa$tocheck$cleaned_name, "[:space:]sp\\.$") & taxa$tocheck$genus %in% resources$genera_all2$genus & - stringr::word(taxa$tocheck$cleaned_name, 2) %in% c("sp.") + word(taxa$tocheck$cleaned_name, 2) %in% c("sp.") ii <- match( @@ -239,7 +263,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = resources$genera_all2$taxonomic_dataset[ii], taxon_rank = "genus", aligned_name_tmp = paste0(resources$genera_all2$genus[ii], " sp."), @@ -263,6 +287,18 @@ match_taxa <- function( if (nrow(taxa$tocheck) == 0) return(taxa) + # Add some extra columns - checking for fuzzy matches in genus and family + # Not including this above, as fuzzy matching is slow + taxa$tocheck <- taxa$tocheck %>% + dplyr::mutate( + fuzzy_match_genus = + fuzzy_match_genera(genus, resources$genera_accepted$genus), + fuzzy_match_genus_synonym = + fuzzy_match_genera(genus, resources$genera_synonym$genus), + fuzzy_match_genus_APNI = + fuzzy_match_genera(genus, resources$genera_APNI$genus) + ) + # match_02b: Genus-level resolution # Fuzzy matches of APC accepted genera for names where the final "word" is `sp` or `spp` and # there isn't an exact match to an APC accepted genus name @@ -271,7 +307,7 @@ match_taxa <- function( i <- stringr::str_detect(taxa$tocheck$cleaned_name, "[:space:]sp\\.$") & taxa$tocheck$fuzzy_match_genus %in% resources$genera_accepted$genus & - stringr::word(taxa$tocheck$cleaned_name, 2) %in% c("sp.") + 
word(taxa$tocheck$cleaned_name, 2) %in% c("sp.") ii <- match( @@ -280,7 +316,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = resources$genera_accepted$taxonomic_dataset[ii], taxon_rank = "genus", aligned_name_tmp = @@ -310,7 +346,7 @@ match_taxa <- function( i <- stringr::str_detect(taxa$tocheck$cleaned_name, "[:space:]sp\\.$") & taxa$tocheck$fuzzy_match_genus_synonym %in% resources$genera_synonym$genus & - stringr::word(taxa$tocheck$cleaned_name, 2) %in% c("sp.") + word(taxa$tocheck$cleaned_name, 2) %in% c("sp.") ii <- match( @@ -319,7 +355,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = resources$genera_synonym$taxonomic_dataset[ii], taxon_rank = "genus", aligned_name_tmp = paste0(resources$genera_synonym$genus[ii], " sp."), @@ -347,10 +383,10 @@ match_taxa <- function( i <- stringr::str_detect(taxa$tocheck$cleaned_name, "[:space:]sp\\.$") & taxa$tocheck$genus %in% resources$family_accepted$canonical_name & - stringr::word(taxa$tocheck$cleaned_name, 2) %in% c("sp.") + word(taxa$tocheck$cleaned_name, 2) %in% c("sp.") taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "family", aligned_name = ifelse(is.na(identifier_string), @@ -385,7 +421,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = resources$genera_all2$taxonomic_dataset[ii], taxon_rank = "genus", aligned_name_tmp = paste0(resources$genera_all2$genus[ii], " sp. [", cleaned_name), @@ -419,7 +455,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus %in% resources$genera_accepted$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus, " sp. 
[", cleaned_name), @@ -451,7 +487,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus_synonym %in% resources$genera_synonym$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus_synonym, " sp. [", cleaned_name), @@ -483,7 +519,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus_APNI %in% resources$genera_APNI$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APNI", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus_APNI, " sp. [", cleaned_name), @@ -515,10 +551,10 @@ match_taxa <- function( !taxa$tocheck$fuzzy_match_genus %in% resources$genera_all2$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = NA_character_, - taxon_rank = "genus", - aligned_name_tmp = paste0(stringr::word(cleaned_name,1), " sp. [", cleaned_name), + taxon_rank = NA, + aligned_name_tmp = paste0(word(cleaned_name,1), " sp. [", cleaned_name), aligned_name = NA, aligned_reason = paste0( "Taxon name includes '--' (double dash) indicating an intergrade between two taxa, but exact and fuzzy matches fail to align to a genus in the APC or APNI (", @@ -557,7 +593,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = resources$genera_all2$taxonomic_dataset[ii], taxon_rank = "genus", aligned_name_tmp = paste0(resources$genera_all2$genus[ii], " sp. [", cleaned_name), @@ -598,7 +634,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus %in% resources$genera_accepted$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus, " sp. 
[", cleaned_name), @@ -637,7 +673,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus_synonym %in% resources$genera_synonym$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus_synonym, " sp. [", cleaned_name), @@ -676,7 +712,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus_APNI %in% resources$genera_APNI$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APNI", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus_APNI, " sp. [", cleaned_name), @@ -717,13 +753,13 @@ match_taxa <- function( !taxa$tocheck$fuzzy_match_genus %in% resources$genera_all2$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = NA_character_, - taxon_rank = "genus", - aligned_name_tmp = paste0(stringr::word(cleaned_name,1), " sp. [", cleaned_name), + taxon_rank = NA, + aligned_name_tmp = paste0(word(cleaned_name,1), " sp. [", cleaned_name), aligned_name = NA, aligned_reason = paste0( - "Taxon name includes '/' (slash) indicating an uncertain species identification but an accepted genus and taxon can only be aligned to genus-rank. Exact and fuzzy matches fail to align to a genus in the APC or APNI (", + "Taxon name includes '/' (slash) indicating an uncertain species identification but exact and fuzzy matches fail to align to a genus in the APC or APNI (", Sys.Date(), ")" ), @@ -740,7 +776,7 @@ match_taxa <- function( # match_05a: fuzzy match to APC-accepted canonical name # Fuzzy match of taxon name to an APC-accepted canonical name, once filler words and punctuation are removed. 
- for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { taxa$tocheck$fuzzy_match_cleaned_APC[i] <- fuzzy_match( txt = taxa$tocheck$stripped_name[i], @@ -761,7 +797,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (accepted)`$taxon_rank[ii], aligned_name = resources$`APC list (accepted)`$canonical_name[ii], @@ -781,7 +817,7 @@ match_taxa <- function( # match_05b: fuzzy match to APC-known canonical name # Fuzzy match of taxon name to an APC-known canonical name, once filler words and punctuation are removed. - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { taxa$tocheck$fuzzy_match_cleaned_APC_synonym[i] <- fuzzy_match( txt = taxa$tocheck$stripped_name[i], @@ -802,7 +838,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (known names)`$taxon_rank[ii], aligned_name = resources$`APC list (known names)`$canonical_name[ii], @@ -833,7 +869,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APNI", taxon_rank = resources$`APNI names`$taxon_rank[ii], aligned_name = resources$`APNI names`$canonical_name[ii], @@ -863,7 +899,7 @@ match_taxa <- function( i <- ( stringr::str_detect(taxa$tocheck$cleaned_name, "[Aa]ff[\\.\\s]") | - stringr::str_detect(taxa$tocheck$cleaned_name, " affinis ") | + stringr::str_detect(taxa$tocheck$cleaned_name, " affinis[\\s|$]") | stringr::str_detect(taxa$tocheck$cleaned_name, " cf[\\.\\s]") ) & taxa$tocheck$genus %in% resources$genera_all2$genus @@ -875,7 +911,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = resources$genera_all2$taxonomic_dataset[ii], taxon_rank = "genus", aligned_name_tmp = paste0(resources$genera_all2$genus[ii], " sp. 
[", cleaned_name), @@ -913,7 +949,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus %in% resources$genera_accepted$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus, " sp. [", cleaned_name), @@ -949,7 +985,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus_synonym %in% resources$genera_synonym$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus_synonym, " sp. [", cleaned_name), @@ -986,7 +1022,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus_APNI %in% resources$genera_APNI$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APNI", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus_APNI, " sp. [", cleaned_name), @@ -1024,13 +1060,13 @@ match_taxa <- function( !taxa$tocheck$fuzzy_match_genus %in% resources$genera_all2$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = NA_character_, - taxon_rank = "genus", - aligned_name_tmp = paste0(stringr::word(cleaned_name,1), " sp. [", cleaned_name), + taxon_rank = NA, + aligned_name_tmp = paste0(word(cleaned_name,1), " sp. [", cleaned_name), aligned_name = NA, aligned_reason = paste0( - "Taxon name includes 'affinis' or 'aff' indicating an unknown taxon that bears an affinity to a different taxon in the same genus and taxon can only be aligned to genus-rank. 
Exact and fuzzy matches fail to align to a genus in the APC or APNI ", + "Taxon name includes 'affinis' or 'aff' indicating an unknown taxon that bears an affinity to a different taxon in the same genus, but exact and fuzzy matches fail to align to a genus in the APC or APNI (", Sys.Date(), ")" ), @@ -1048,7 +1084,7 @@ match_taxa <- function( # For imprecise fuzzy matches, the taxon name can differ from the `APC-accepted` names by 5 characters & up to 25% of the string length. # These matches require individual review and are turned off as a default. if (imprecise_fuzzy_matches == TRUE) { - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { taxa$tocheck$fuzzy_match_cleaned_APC_imprecise[i] <- fuzzy_match( txt = taxa$tocheck$stripped_name[i], @@ -1070,7 +1106,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (accepted)`$taxon_rank[ii], aligned_name = resources$`APC list (accepted)`$canonical_name[ii], @@ -1094,7 +1130,7 @@ match_taxa <- function( # For imprecise fuzzy matches, the taxon name can differ from the `APC -known` names by 5 characters & up to 25% of the string length. # These matches require individual review and are turned off as a default. 
if (imprecise_fuzzy_matches == TRUE) { - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { taxa$tocheck$fuzzy_match_cleaned_APC_synonym_imprecise[i] <- fuzzy_match( txt = taxa$tocheck$stripped_name[i], @@ -1116,7 +1152,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (known names)`$taxon_rank[ii], aligned_name = resources$`APC list (known names)`$canonical_name[ii], @@ -1152,7 +1188,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = resources$genera_all2$taxonomic_dataset[ii], taxon_rank = "genus", aligned_name_tmp = paste0(resources$genera_all2$genus[ii], " x [", cleaned_name), @@ -1187,7 +1223,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus %in% resources$genera_accepted$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus, " x [", cleaned_name), @@ -1220,7 +1256,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus_synonym %in% resources$genera_synonym$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus_synonym, " x [", cleaned_name), @@ -1254,7 +1290,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus_APNI %in% resources$genera_APNI$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APNI", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus_APNI, " x [", cleaned_name), @@ -1288,13 +1324,13 @@ match_taxa <- function( !taxa$tocheck$fuzzy_match_genus %in% resources$genera_all2$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = NA_character_, - taxon_rank = "genus", - aligned_name_tmp = 
paste0(stringr::word(cleaned_name,1), " x [", cleaned_name), + taxon_rank = NA, + aligned_name_tmp = paste0(word(cleaned_name,1), " x [", cleaned_name), aligned_name = NA, aligned_reason = paste0( - "Taxon name includes ' x ' indicating a hybrid taxon and taxon can only be aligned to genus-rank. Exact and fuzzy matches fail to align to a genus in the APC or APNI (", + "Taxon name includes ' x ' indicating a hybrid, but exact and fuzzy matches fail to align to a genus in the APC or APNI (", Sys.Date(), ")" ), @@ -1323,7 +1359,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (accepted)`$taxon_rank[ii], aligned_name = resources$`APC list (accepted)`$canonical_name[ii], @@ -1357,7 +1393,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (known names)`$taxon_rank[ii], aligned_name = resources$`APC list (known names)`$canonical_name[ii], @@ -1381,7 +1417,7 @@ match_taxa <- function( # sometimes the submitted taxon name is a valid trinomial + notes and # such names will only be aligned by matches considering only the first three words of the stripped name. 
# This match also does a good job aligning and correcting syntax of phrase names - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { if (!is.na(taxa$tocheck$trinomial[i])) { taxa$tocheck$fuzzy_match_trinomial[i] <- fuzzy_match( @@ -1404,7 +1440,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (accepted)`$taxon_rank[ii], aligned_name = resources$`APC list (accepted)`$canonical_name[ii], @@ -1428,7 +1464,7 @@ match_taxa <- function( # sometimes the submitted taxon name is a valid trinomial + notes and # such names will only be aligned by matches considering only the first three words of the stripped name. # This match also does a good job aligning and correcting syntax of phrase names - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { if (!is.na(taxa$tocheck$trinomial[i])) { taxa$tocheck$fuzzy_match_trinomial_synonym[i] <- fuzzy_match( @@ -1451,7 +1487,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (known names)`$taxon_rank[ii], aligned_name = resources$`APC list (known names)`$canonical_name[ii], @@ -1487,7 +1523,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (accepted)`$taxon_rank[ii], aligned_name = resources$`APC list (accepted)`$canonical_name[ii], @@ -1522,7 +1558,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (known names)`$taxon_rank[ii], aligned_name = resources$`APC list (known names)`$canonical_name[ii], @@ -1547,7 +1583,7 @@ match_taxa <- function( # or a valid binomial + invalid infraspecific epithet. 
# Such names will only be aligned by matches considering only the first two words of the stripped name. # This match also does a good job aligning and correcting syntax of phrase names. - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { if (!is.na(taxa$tocheck$binomial[i]) & is.na(taxa$tocheck$fuzzy_match_binomial[i])) { taxa$tocheck$fuzzy_match_binomial[i] <- @@ -1572,7 +1608,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (accepted)`$taxon_rank[ii], aligned_name = resources$`APC list (accepted)`$canonical_name[ii], @@ -1597,7 +1633,7 @@ match_taxa <- function( # or a valid binomial + invalid infraspecific epithet. # Such names will only be aligned by matches considering only the first two words of the stripped name. # This match also does a good job aligning and correcting syntax of phrase names. - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { if (!is.na(taxa$tocheck$binomial[i]) & is.na(taxa$tocheck$fuzzy_match_binomial_APC_synonym[i])) { taxa$tocheck$fuzzy_match_binomial_APC_synonym[i] <- @@ -1622,7 +1658,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = resources$`APC list (known names)`$taxon_rank[ii], aligned_name = resources$`APC list (known names)`$canonical_name[ii], @@ -1648,7 +1684,7 @@ match_taxa <- function( # to avoid incorrectly aligning an APC accepted/known taxa to an APNI name. # This is especially true to accurately align phrase names. 
if (APNI_matches == TRUE) { - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { taxa$tocheck$fuzzy_match_cleaned_APNI[i] <- fuzzy_match( txt = taxa$tocheck$stripped_name[i], @@ -1670,7 +1706,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APNI", taxon_rank = resources$`APNI names`$taxon_rank[ii], aligned_name = resources$`APNI names`$canonical_name[ii], @@ -1695,7 +1731,7 @@ match_taxa <- function( # These matches require individual review and are turned off as a default. if (APNI_matches == TRUE & imprecise_fuzzy_matches == TRUE) { - for (i in 1:nrow(taxa$tocheck)) { + for (i in seq_len(nrow(taxa$tocheck))) { taxa$tocheck$fuzzy_match_cleaned_APNI_imprecise[i] <- fuzzy_match( txt = taxa$tocheck$cleaned_name[i], @@ -1717,7 +1753,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APNI", taxon_rank = resources$`APNI names`$taxon_rank[ii], aligned_name = resources$`APNI names`$canonical_name[ii], @@ -1753,7 +1789,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APNI", taxon_rank = resources$`APNI names`$taxon_rank[ii], aligned_name = resources$`APNI names`$canonical_name[ii], @@ -1790,7 +1826,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APNI", taxon_rank = resources$`APNI names`$taxon_rank[ii], aligned_name = resources$`APNI names`$canonical_name[ii], @@ -1822,7 +1858,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "genus", aligned_name_tmp = paste0(resources$genera_accepted$genus[ii], " sp. 
[", cleaned_name), @@ -1857,7 +1893,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "genus", aligned_name_tmp = paste0(resources$genera_synonym$genus[ii], " sp. [", cleaned_name), @@ -1893,7 +1929,7 @@ match_taxa <- function( ) taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APNI", taxon_rank = "genus", aligned_name_tmp = paste0(resources$genera_APNI$genus[ii], " sp. [", cleaned_name), @@ -1921,11 +1957,11 @@ match_taxa <- function( # The 'taxon name' is then reformatted as `family sp.` with the original name in square brackets. i <- - stringr::str_detect(stringr::word(taxa$tocheck$cleaned_name, 1), "aceae$") & + stringr::str_detect(word(taxa$tocheck$cleaned_name, 1), "aceae$") & taxa$tocheck$genus %in% resources$family_accepted$canonical_name taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "family", aligned_name_tmp = paste0(genus, " sp. [", cleaned_name), @@ -1943,11 +1979,42 @@ match_taxa <- function( alignment_code = "match_12d_family_exact_accepted" ) + taxa <- redistribute(taxa) + if (nrow(taxa$tocheck) == 0) + return(taxa) + + # match_12e: family-level synonym alignment + # Toward the end of the alignment function, see if first word of unmatched taxa is an APC-known family. + # The 'taxon name' is then reformatted as `family sp.` with the original name in square brackets. + + i <- + stringr::str_detect(word(taxa$tocheck$cleaned_name, 1), "ae$") & + taxa$tocheck$genus %in% resources$family_synonym$canonical_name + + taxa$tocheck[i,] <- taxa$tocheck[i,] %>% + dplyr::mutate( + taxonomic_dataset = "APC", + taxon_rank = "family", + aligned_name_tmp = paste0(genus, " sp. 
[", cleaned_name), + aligned_name = ifelse(is.na(identifier_string2), + paste0(aligned_name_tmp, "]"), + paste0(aligned_name_tmp, identifier_string2, "]") + ), + aligned_reason = paste0( + "Exact match of the first word of the taxon name to an APC-synonymous family (", + Sys.Date(), + ")" + ), + known = TRUE, + checked = TRUE, + alignment_code = "match_12e_family_exact_synonym" + ) + taxa <- redistribute(taxa) if (nrow(taxa$tocheck) == 0) return(taxa) - # match_12e: genus-level fuzzy alignment + # match_12f: genus-level fuzzy alignment # The final alignment step is to see if a fuzzy match can be made for the first word of unmatched taxa to an APC-accepted genus . # The 'taxon name' is then reformatted as `genus sp.` with the original name in square brackets. @@ -1955,7 +2022,7 @@ match_taxa <- function( taxa$tocheck$fuzzy_match_genus %in% resources$genera_accepted$genus taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus, " sp. [", cleaned_name), @@ -1970,22 +2037,22 @@ match_taxa <- function( ), known = TRUE, checked = TRUE, - alignment_code = "match_12e_genus_fuzzy_accepted" + alignment_code = "match_12f_genus_fuzzy_accepted" ) taxa <- redistribute(taxa) if (nrow(taxa$tocheck) == 0) return(taxa) - - # match_12f: genus-level fuzzy alignment - # The final alignment step is to see if a fuzzy match can be made for the first word of unmatched taxa to an APC-known genus . + + # match_12g: genus-level fuzzy alignment of synonyms + # Another alignment step is to see if a fuzzy match can be made for the first word of unmatched taxa to an APC-known genus. # The 'taxon name' is then reformatted as `genus sp.` with the original name in square brackets. 
- + i <- taxa$tocheck$fuzzy_match_genus_synonym %in% resources$genera_synonym$genus - + taxa$tocheck[i,] <- taxa$tocheck[i,] %>% - mutate( + dplyr::mutate( taxonomic_dataset = "APC", taxon_rank = "genus", aligned_name_tmp = paste0(fuzzy_match_genus_synonym, " sp. [", cleaned_name), @@ -2000,11 +2067,80 @@ match_taxa <- function( ), known = TRUE, checked = TRUE, - alignment_code = "match_12f_genus_fuzzy_synonym" + alignment_code = "match_12g_genus_fuzzy_synonym" + ) + + taxa <- redistribute(taxa) + if (nrow(taxa$tocheck) == 0) + return(taxa) + + # match_12h: family-level fuzzy alignment + # Alignment step is to see if a fuzzy match can be made for the first word of unmatched taxa to an APC-accepted family. + # The 'taxon name' is then reformatted as `family sp.` with the original name in square brackets. + + # Add some extra columns - checking for fuzzy matches in family + # Not including this above, as fuzzy matching is slow + taxa$tocheck <- taxa$tocheck %>% + dplyr::mutate( + fuzzy_match_family = + fuzzy_match_genera(genus, resources$family_accepted$canonical_name), + fuzzy_match_family_synonym = + fuzzy_match_genera(genus, resources$family_synonym$canonical_name) + ) + + i <- + taxa$tocheck$fuzzy_match_family %in% resources$family_accepted$canonical_name + + taxa$tocheck[i,] <- taxa$tocheck[i,] %>% + dplyr::mutate( + taxonomic_dataset = "APC", + taxon_rank = "family", + aligned_name_tmp = paste0(fuzzy_match_family, " sp. 
[", cleaned_name), + aligned_name = ifelse(is.na(identifier_string2), + paste0(aligned_name_tmp, "]"), + paste0(aligned_name_tmp, identifier_string2, "]") + ), + aligned_reason = paste0( + "Fuzzy match of the first word of the taxon name to an APC-accepted family (", + Sys.Date(), + ")" + ), + known = TRUE, + checked = TRUE, + alignment_code = "match_12h_family_fuzzy_accepted" ) taxa <- redistribute(taxa) + if (nrow(taxa$tocheck) == 0) + return(taxa) + + # match_12i: family-level fuzzy alignment for synonyms + # The final alignment step is to see if a fuzzy match can be made for the first word of unmatched taxa to an APC-synonymous family. + # The 'taxon name' is then reformatted as `genus sp.` with the original name in square brackets. + + i <- + taxa$tocheck$fuzzy_match_family_synonym %in% resources$family_synonym$canonical_name + taxa$tocheck[i,] <- taxa$tocheck[i,] %>% + dplyr::mutate( + taxonomic_dataset = "APC", + taxon_rank = "family", + aligned_name_tmp = paste0(fuzzy_match_family_synonym, " sp. [", cleaned_name), + aligned_name = ifelse(is.na(identifier_string2), + paste0(aligned_name_tmp, "]"), + paste0(aligned_name_tmp, identifier_string2, "]") + ), + aligned_reason = paste0( + "Fuzzy match of the first word of the taxon name to an APC-synonymous family (", + Sys.Date(), + ")" + ), + known = TRUE, + checked = TRUE, + alignment_code = "match_12i_family_fuzzy_synonym" + ) + + taxa <- redistribute(taxa) if (nrow(taxa$tocheck) == 0) return(taxa) diff --git a/R/native_anywhere_in_australia.R b/R/native_anywhere_in_australia.R index 05767c8b..cd7aba60 100644 --- a/R/native_anywhere_in_australia.R +++ b/R/native_anywhere_in_australia.R @@ -1,17 +1,28 @@ -#' For a vector of taxon names in to the APC, check if the species are native anywhere in Australia +#' @title Native anywhere in Australia +#' +#' @description +#' This function checks which species from a list is thought to be native anywhere in +#' Australia according to the APC. 
#' -#' This function checks if the given species is native anywhere in Australia according to the APC. -#' Note that this will not detect within-Australia introductions, e.g. if a species is from Western Australia and is invasive on the east coast. -#' And recent invasions are unlikely to be documented yet in APC. -#' For the complete matrix of species by states that also represents within-Australia invasions, -#' use \link{create_species_state_origin_matrix}. For spelling checks and taxonomy updates please see \link{create_taxonomic_update_lookup}. +#' @details +#' Important caveats: +#' - This function will not detect within-Australia introductions, +#' e.g. if a species is from Western Australia and is invasive on the east coast. +#' - Very recent invasions are unlikely to be documented yet in APC. +#' - Ideally check spelling and taxonomy updates first via +#' \link{create_taxonomic_update_lookup}. +#' - For the complete matrix of species by states that also represents +#' within-Australia invasions, use \link{create_species_state_origin_matrix}. #' #' @family diversity methods #' @param species A character string typically representing the binomial for the species. #' @param resources An optional list of taxonomic resources to use for the lookup. -#' If not provided, the function will load default taxonomic resources using the `load_taxonomic_resources()` function. -#' @return A tibble with two columns: `species`, which is the same as the unique values of the input `species`, -#' and `native_anywhere_in_aus`, a vector indicating whether each species is native anywhere in Australia, introduced by humans from elsewhere, or unknown with respect to the APC resource. +#' If not provided, the function will load default taxonomic resources using the +#' `load_taxonomic_resources()` function. 
+#' @return A tibble with two columns: `species`, which is the same as the unique values of +#' the input `species`, and `native_anywhere_in_aus`, a vector indicating whether each +#' species is native anywhere in Australia, introduced by humans from elsewhere, or +#' unknown with respect to the APC resource. #' @export #' @examples #' \donttest{native_anywhere_in_australia(c("Eucalyptus globulus","Pinus radiata","Banksis notaspecies"))} @@ -36,7 +47,7 @@ native_anywhere_in_australia <- function(species, resources = load_taxonomic_res fulllist <- species %in% full_lookup$species # Create output tibble - result <- tibble( + result <- dplyr::tibble( species = species, native_anywhere_in_aus = dplyr::case_when( natives & fulllist ~ "native", diff --git a/R/reexports.R b/R/reexports.R new file mode 100644 index 00000000..d9231565 --- /dev/null +++ b/R/reexports.R @@ -0,0 +1,2 @@ +#' @importFrom dplyr %>% +dplyr::`%>%` diff --git a/R/release.R b/R/release.R new file mode 100644 index 00000000..69017010 --- /dev/null +++ b/R/release.R @@ -0,0 +1,74 @@ + #' Download taxonomic resources for GitHub Release +#' +#' @param version_name character string of version name, follow semantic versioning +#' @param path to download parquets to upload +#' @keywords internal +#' @noRd + +download_taxonomic_resources_for_release<- function(version_name = NULL, path = "ignore/"){ + +# TODO: Use gh package to release programmatically +# body <- paste0('{"tag_name":"',version_name,'","target_commitish":"master","name":"',version_name,'","body":"Download of taxonomic resources from APC and APNI as of ',Sys.Date(),'","draft":true,"prerelease":false,"generate_release_notes":false}') +# +# # Creating release via GH API +# gh::gh("POST /repos/{owner}/{repo}/releases", +# owner = "traitecoevo", repo = "APCalign", +# charToRaw(body), +# .send_headers = c( +# Accept = "application/vnd.github.switcheroo-preview+json", +# "Content-Type" = "application/json" +# ) +# ) + +# Download APC + APC <- + 
readr::read_csv( + "https://biodiversity.org.au/nsl/services/export/taxonCsv", + col_types = + readr::cols( + .default = readr::col_character(), + proParte = readr::col_logical(), + taxonRankSortOrder = readr::col_double(), + created = readr::col_datetime(format = ""), + modified = readr::col_datetime(format = "") + ) + ) + + # Save APC as parquet +arrow::write_parquet(APC, sink = paste0(path,"apc.parquet")) +# Save APC as tar.gz +readr::write_csv(APC, file = paste0(path,"apc.tar.gz")) + +# Download APNI + APNI <- + readr::read_csv( + "https://biodiversity.org.au/nsl/services/export/namesCsv", + col_types = + readr::cols( + .default = readr::col_character(), + autonym = readr::col_logical(), + hybrid = readr::col_logical(), + cultivar = readr::col_logical(), + formula = readr::col_logical(), + scientific = readr::col_logical(), + nomInval = readr::col_logical(), + nomIlleg = readr::col_logical(), + namePublishedInYear = readr::col_double(), + taxonRankSortOrder = readr::col_double(), + created = readr::col_datetime(format = ""), + modified = readr::col_datetime(format = "") + ) + ) + +# Exclude names that are in APC from APNI + APNI_cleaned <- APNI |> + dplyr::filter(!canonicalName %in% APC$canonicalName) + +# Save APNI as parquet +arrow::write_parquet(APNI_cleaned, sink = paste0(path,"apni.parquet")) + +# Save APNI as tar.gz +readr::write_csv(APNI_cleaned, file = paste0(path,"apni.tar.gz")) + +} + diff --git a/R/standardise_names.R b/R/standardise_names.R index 94fd25ec..a619734f 100644 --- a/R/standardise_names.R +++ b/R/standardise_names.R @@ -1,16 +1,26 @@ -#' Standardises taxon names by performing a series of text substitutions to remove common inconsistencies in taxonomic nomenclature. -#' +#' @title Standardise taxon names +#' +#' @description +#' Standardises taxon names by performing a series of text substitutions to +#' remove common inconsistencies in taxonomic nomenclature. 
+#' #' The function takes a character vector of taxon names as input and -#' returns a character vector of taxon names using standardised taxonomic syntax as output. -#' In particular it standardises taxon rank abbreviations and qualifiers (subsp., var., f.), as people use many variants of these terms. -#' It also standardises or removes a few additional filler words used within taxon names (affinis becomes aff.; s.l. and s.s. are removed). +#' returns a character vector of taxon names using standardised taxonomic syntax +#' as output. +#' +#' @details +#' - It removes stray punctuation at the start and end of a character string. +#' - It standardises unusual characters and symbols to ASCII equivalents. +#' - It standardises taxon rank abbreviations and qualifiers (subsp., var., f.), +#' as people use many variants of these terms. +#' - It standardises or removes a few additional filler words used within +#' taxon names (affinis becomes aff.; s.l. and s.s. are removed). #' #' @param taxon_names A character vector of taxon names that need to be standardised. #' #' @return A character vector of standardised taxon names. #' -#' #' @examples #' standardise_names(c("Quercus suber", #' "Eucalyptus sp.", @@ -26,12 +36,32 @@ standardise_names <- function(taxon_names) { } taxon_names %>% - ## for hybrid markers + ## remove ? 
throughout + f("\\?", "") %>% + + ## remove all punct and symbols at start of string + ## this combination should catch almost everything + ## it is essential there are no stray characters at the start of strings + ## for fuzzy-matching to work once the reference list is split by first-character + stringr::str_replace("^[~!@#$%^&*()_+-=`;',./<>?:{}|]+", "") %>% + stringr::str_replace("^[:punct:]+", "") %>% + + ## remove * at end of string + f("\\*$", "") %>% + + ## replace hybrid x marker with standard x + ## for certain hybrid x's that aren't dealt with below + f("\u00D7", "x") %>% + + ## hybrid markers and other non-standard characters used are replaced with + ## the standard equivalent (e.g. x, \) stringi::stri_trans_general("Any-Latin; Latin-ASCII") %>% - f("\\*", "x") %>% + + ## add spaces between letters and / + f("([a-zA-Z])/([a-zA-Z])", "\\1 / \\2") %>% ## remove ".." - stringr::str_replace("\\.\\.", "\\.") %>% + f("\\.\\.", "\\.") %>% ## Weird formatting f("[\\n\\t]", " ") %>% @@ -63,7 +93,7 @@ standardise_names <- function(taxon_names) { f("\\saffin(\\s|$)", " aff. ") %>% f("\\saff(\\s|$)", " aff. ") %>% f("\\saffn(\\s|$|\\.)", " aff. ") %>% - f("\\saffinis(\\s|$)", " aff. ") %>% + f("\\saffinis(\\s)", " aff. ") %>% ## f. not forma or form or form. or f f("\\sforma(\\s|$)", " f. ") %>% @@ -90,6 +120,7 @@ standardise_names <- function(taxon_names) { ## standarise "ser" f("\\sser(\\s|\\.\\s)", " ser. ") %>% + f("\\sseries(\\s|\\.\\s)", " ser. ") %>% ## clean white space stringr::str_squish() @@ -107,17 +138,55 @@ standardise_names <- function(taxon_names) { #' @return The genus for a scientific name. 
#' #' @examples -#' genus = extract_genus(stripped_name) +#' extract_genus(c("Banksia integrifolia", "Acacia longifolia")) #' #' @keywords internal #' @noRd - extract_genus <- function(taxon_name) { - genus <- - ifelse( - stringr::word(taxon_name, 1) %>% stringr::str_to_lower() == "x", - paste(stringr::word(taxon_name, 1) %>% stringr::str_to_lower(), stringr::word(taxon_name, 2) %>% stringr::str_to_sentence()), - stringr::word(taxon_name, 1) %>% stringr::str_to_sentence() - ) + + taxon_name <- standardise_names(taxon_name) + + genus <- stringr::str_split_i(taxon_name, " |\\/", 1) %>% stringr::str_to_sentence() + + # Deal with names that being with x, + # e.g."x Taurodium x toveyanum" or "x Glossadenia tutelata" + i <- !is.na(genus) & genus =="X" + + genus[i] <- + stringr::str_split_i(taxon_name[i], " |\\/", 2) %>% stringr::str_to_sentence() %>% paste("x", .) + genus } + + +#' @title Standardise taxon ranks +#' +#' @description +#' Standardise taxon ranks from Latin into English. +#' +#' @details +#' The function takes a character vector of Latin taxon ranks as input and +#' returns a character vector of taxon ranks using standardised English terms. +#' +#' @param taxon_rank A character vector of Latin taxon ranks. +#' +#' @return A character vector of English taxon ranks. 
+#' +#' @examples +#' standardise_taxon_rank(c("regnum", "kingdom", "classis", "class")) +#' @export +standardise_taxon_rank <- function(taxon_rank) { + f <- function(x, find, replace) { + gsub(find, replace, x, fixed = TRUE) + } + + taxon_rank %>% + stringr::str_to_lower() %>% + f("regnum", "kingdom") %>% + f("classis", "class") %>% + f("ordo", "order") %>% + f("familia", "family") %>% + f("varietas", "variety") %>% + f("forma", "form") %>% + f("sectio", "section") +} diff --git a/R/state_diversity_counts.R b/R/state_diversity_counts.R index 36c169b7..ad0ce302 100644 --- a/R/state_diversity_counts.R +++ b/R/state_diversity_counts.R @@ -1,14 +1,24 @@ -#' For Australian states and territories, use data from the APC to calculate state-level diversity for native, introduced, and more complicated species origins -#' -#' This function calculates state-level diversity for native, introduced, and more complicated species origins -#' based on the geographic data available in the APC. +#' @title State- and territory-level diversity +#' +#' @description +#' For Australian states and territories, use geographic distribution data from +#' the APC to calculate state-level diversity for native, introduced, +#' and more complicated species origins #' #' @family diversity methods -#' @param state A character string indicating the Australian state or territory to calculate the diversity for. Possible values are "NSW", "NT", "Qld", "WA", "ChI", "SA", "Vic", "Tas", "ACT", "NI", "LHI", "MI", "HI", "MDI", "CoI", "CSI", and "AR". -#' @param resources the taxonomic resources required to make the summary statistics. loading this can be slow, so call load_taxonomic_resources separately to greatly speed this function up and pass the resources in. +#' @param state A character string indicating the Australian state or +#' territory to calculate the diversity for. 
Possible values are "NSW", "NT", +#' "Qld", "WA", "ChI", "SA", "Vic", "Tas", "ACT", "NI", "LHI", "MI", "HI", +#' "MDI", "CoI", "CSI", and "AR". +#' @param resources the taxonomic resources required to make the summary +#' statistics. loading this can be slow, so call load_taxonomic_resources +#' separately to greatly speed this function up and pass the resources in. #' -#' @return A tibble of diversity counts for the specified state or territory, including native, introduced, and more complicated species origins. -#' The tibble has three columns: "origin" indicating the origin of the species, "state" indicating the Australian state or territory, and "num_species" indicating the number of species for that origin and state. +#' @return A tibble of diversity counts for the specified state or territory, +#' including native, introduced, and more complicated species origins. +#' The tibble has three columns: "origin" indicating the origin of the +#' species, "state" indicating the Australian state or territory, and +#' "num_species" indicating the number of species for that origin and state. 
#' #' @seealso \code{\link{load_taxonomic_resources}} #' @@ -51,7 +61,7 @@ state_diversity_counts <- function(state, create_species_state_origin_matrix(resources = resources) test2 <- test[test[[state]] != "not present", ] state_table <- table(test2[[state]]) - return(tibble( + return(dplyr::tibble( origin = names(state_table), state = state, num_species = state_table @@ -63,11 +73,11 @@ state_diversity_counts <- function(state, #' @noRd get_apc_genus_family_lookup <- function(resources = load_taxonomic_resources()) { - apc_s <- filter(resources$APC, + apc_s <- dplyr::filter(resources$APC, taxon_rank == "species") - tibble(genus = word(apc_s$scientific_name, 1, 1), + dplyr::tibble(genus = word(apc_s$scientific_name, 1, 1), family = apc_s$family) %>% - distinct() -> lu + dplyr::distinct() -> lu return(lu) } diff --git a/R/strip_names.R b/R/strip_names.R index 488f23f8..bb1a365d 100644 --- a/R/strip_names.R +++ b/R/strip_names.R @@ -1,13 +1,22 @@ -#' Strip taxonomic names of taxon rank abbreviations and qualifiers and special characters +#' @title Strip taxon names +#' +#' @description +#' Strip taxonomic names of taxon rank abbreviations and qualifiers +#' and special characters #' -#' Given a vector of taxonomic names, this function removes subtaxa designations ("subsp.", "var.", "f.", and "ser"), -#' special characters (e.g., "-", ".", "(", ")", "?"), and extra whitespace. The resulting vector -#' of names is also converted to lowercase. +#' @details +#' Given a vector of taxonomic names, this function removes: +#' - subtaxa designations ("subsp.", "var.", "f.", and "ser") +#' - special characters (e.g., "-", ".", "(", ")", "?") +#' - extra whitespace +#' +#' The resulting vector of names is also converted to lowercase. #' #' @param taxon_names A character vector of taxonomic names to be stripped. 
#' -#' @return A character vector of stripped taxonomic names, with subtaxa designations, special -#' characters, and extra whitespace removed, and all letters converted to lowercase. +#' @return A character vector of stripped taxonomic names, +#' with subtaxa designations, special characters, and extra whitespace +#' removed, and all letters converted to lowercase. #' #' #' @examples @@ -17,60 +26,64 @@ #' #' @export strip_names <- function(taxon_names) { + + f <- function(x, find, replace) { + gsub(find, replace, x, perl = TRUE) + } + taxon_names %>% - stringr::str_replace_all("\\.", "") %>% - stringr::str_replace_all("\\ \\)", "") %>% - stringr::str_replace_all("\\(\\ ", "") %>% + f("\\.", "") %>% + f("\\ \\)", "") %>% + f("\\(\\ ", "") %>% stringr::str_replace_all("[:punct:]", " ") %>% stringr::str_replace_all("\\u2215", " ") %>% - stringr::str_replace_all("\\,", "") %>% - stringr::str_replace_all("\\=", " ") %>% - stringr::str_replace_all(" ", " ") %>% - stringr::str_replace_all(" subsp ", " ") %>% - stringr::str_replace_all(" var ", " ") %>% - stringr::str_replace_all(" ser ", " ") %>% - stringr::str_replace_all(" f ", " ") %>% + f("\\,", "") %>% + f("\\=", " ") %>% + f(" ", " ") %>% + f(" subsp ", " ") %>% + f(" var ", " ") %>% + f(" ser ", " ") %>% + f(" f ", " ") %>% stringr::str_squish() %>% - tolower() + stringr::str_to_lower() } -#' Strip taxonomic names of taxon rank abbreviations and qualifiers, filler words and special characters -#' -#' Given a vector of taxonomic names, this function removes subtaxa designations ("subsp.", "var.", "f.", and "ser"), -#' additional filler words and characters (" x " for hybrid taxa, "sp."), -#' special characters (e.g., "-", ".", "(", ")", "?"), and extra whitespace. The resulting vector -#' of names is also converted to lowercase. +#' @title Strip taxon names, extra +#' +#' @description +#' Strip taxonomic names of `sp.` and hybrid symbols. 
This function assumes +#' that a character function has already been run through `strip_names`. +#' +#' @details +#' Given a vector of taxonomic names, this function removes additional filler +#' words (" x " for hybrid taxa, "sp.") not removed by the function +#' `strip_names` #' #' @param taxon_names A character vector of taxonomic names to be stripped. #' -#' @return A character vector of stripped taxonomic names, with subtaxa designations, special -#' characters, additional filler words and extra whitespace removed, and all letters converted to lowercase. +#' @return A character vector of stripped taxonomic names, +#' with `sp.` and hybrid symbols removed. #' #' #' @examples -#' strip_names_2(c("Abies lasiocarpa subsp. lasiocarpa", +#' strip_names_extra(c("Abies lasiocarpa subsp. lasiocarpa", #' "Quercus kelloggii", #' "Pinus contorta var. latifolia", #' "Acacia sp.", #' "Lepidium sp. Tanguin Hill (K.R.Newbey 10501)")) #' #' @export -strip_names_2 <- function(taxon_names) { +strip_names_extra <- function(taxon_names) { + + f <- function(x, find, replace) { + gsub(find, replace, x, perl = TRUE) + } + taxon_names %>% - stringr::str_replace_all("\\.", "") %>% - stringr::str_replace_all("[:punct:]", " ") %>% - stringr::str_replace_all("\\u2215", " ") %>% - stringr::str_replace_all(" subsp ", " ") %>% - stringr::str_replace_all(" var ", " ") %>% - stringr::str_replace_all(" ser ", " ") %>% - stringr::str_replace_all(" f ", " ") %>% - stringr::str_replace_all(" species ", " ") %>% - stringr::str_replace_all(" x ", " ") %>% - stringr::str_replace_all(" sp ", " ") %>% - stringr::str_replace_all(" sp1", " 1") %>% - stringr::str_replace_all(" sp2", " 2") %>% - stringr::str_replace_all("\\=", " ") %>% - stringr::str_replace_all(" ", " ") %>% - stringr::str_squish() %>% - tolower() + f(" species ", " ") %>% + f(" x ", " ") %>% + f(" sp ", " ") %>% + f(" sp1", " 1") %>% + f(" sp2", " 2") %>% + stringr::str_squish() } diff --git a/R/update_taxonomy.R b/R/update_taxonomy.R 
index 8c232e97..80012ab3 100644 --- a/R/update_taxonomy.R +++ b/R/update_taxonomy.R @@ -1,50 +1,92 @@ -#' For a list of taxon names aligned to the APC, update the name to an accepted taxon concept per the APC and add scientific name and taxon concept metadata to names aligned to either the APC or APNI. +#' @title Update to currently accepted APC name and add APC/APNI name metadata +#' +#' @description +#' For a list of taxon names aligned to the APC, update the name to an accepted +#' taxon concept per the APC and add scientific name and taxon concept metadata +#' to names aligned to either the APC or APNI. #' -#' This function uses the APC to update the taxonomy of names aligned to a taxon concept listed in the APC to the currently accepted name for the taxon concept. -#' The aligned_data data frame that is input must contain 5 columns, -#' `original_name`, `aligned_name`, `taxon_rank`, `taxonomic_dataset`, and `aligned_reason`. -#' The aligned name is a plant name that has been aligned to a taxon name in the APC or APNI by the align_taxa function. +#' @details +#' - This function uses the APC to update the taxonomy of names aligned to a +#' taxon concept listed in the APC to the currently accepted name for the taxon +#' concept. +#' - The aligned_data data frame that is input must contain 5 columns, +#' `original_name`, `aligned_name`, `taxon_rank`, `taxonomic_dataset`, and +#' `aligned_reason`. (These are the columns output by the function `align_taxa`.) +#' - The aligned name is a plant name that has been aligned to a taxon name in +#' the APC or APNI by the align_taxa function. +#' +#' Notes: +#' - As the input for this function is a table with 5 columns (output by +#' align_taxa), this function will only be used when you explicitly want to +#' separate the alignment and updating components of APCalign. This function is +#' the second half of create_taxonomic_update_lookup. 
#' #' @family taxonomic alignment functions #' -#' @param aligned_data A tibble of plant names to update. This table must include 5 columns, original_name, aligned_name, taxon_rank, taxonomic_dataset, and aligned_reason. +#' @param aligned_data A tibble of plant names to update. This table must +#' include 5 columns, original_name, aligned_name, taxon_rank, +#' taxonomic_dataset, and aligned_reason. #' These columns are created by the function `align_taxa`. -#' The columns `original_name` and `aligned_name` must be in the format of the scientific name, with genus and species, -#' and may contain additional qualifiers such as subspecies or varieties. The names are case insensitive. -#' -#' @param taxonomic_splits Variable that determines what protocol to use to update taxon names that are ambiguous due to taxonomic splits. +#' The columns `original_name` and `aligned_name` must be in the format of the +#' scientific name, with genus and species, +#' and may contain additional qualifiers such as subspecies or varieties. The +#' names are case insensitive. +#' @param taxonomic_splits Variable that determines what protocol to use to +#' update taxon names that are ambiguous due to taxonomic splits. #' The three options are: -#' most_likely_species, which returns the species name in use before the split; alternative names are returned in a separate column -#' return_all, which returns all possible names -#' collapse_to_higher_taxon, which declares that an ambiguous name cannot be aligned to an accepted species/infraspecific name and the name is demoted to genus rank -#' -#' @param output (optional) Name of the file where results are saved. The default is NULL and no file is created. -#' If specified, the output will be saved in a CSV file with the given name. 
+#' - `most_likely_species`, which returns the species name in use before the +#' split; alternative names are returned in a separate column +#' - `return_all`, which returns all possible names +#' - `collapse_to_higher_taxon`, which declares that an ambiguous name cannot +#' be aligned to an accepted species/infraspecific name and the name is +#' demoted to genus rank +#' @param quiet Logical to indicate whether to display messages while updating +#' taxa. +#' @param output (optional) Name of the file where results are saved. The +#' default is NULL and no file is created. If specified, the output will be +#' saved in a CSV file with the given name. +#' @param resources the taxonomic resources required to make the summary +#' statistics. Loading this can be slow, so call load_taxonomic_resources +#' separately to greatly speed this function up and pass the resources in. #' -#' @param resources the taxonomic resources required to make the summary statistics. Loading this can be slow, so call load_taxonomic_resources separately to greatly speed this function up and pass the resources in. #' -#' -#' @return A tibble with updated taxonomy for the specified plant names. The tibble contains the following columns: +#' @return A tibble with updated taxonomy for the specified plant names. The +#' tibble contains the following columns: #' - original_name: the original plant name. -#' - aligned_name: the input plant name that has been aligned to a taxon name in the APC or APNI by the align_taxa function. +#' - aligned_name: the input plant name that has been aligned to a taxon name +#' in the APC or APNI by the align_taxa function. #' - accepted_name: the APC-accepted plant name, when available. -#' - suggested_name: the suggested plant name to use. Identical to the accepted_name, when an accepted_name exists; otherwise the the suggested_name is the aligned_name. -#' - genus: the genus of the accepted (or suggested) name; only APC-accepted genus names are filled in. 
-#' - family: the family of the accepted (or suggested) name; only APC-accepted family names are filled in. +#' - suggested_name: the suggested plant name to use. Identical to the +#' accepted_name, when an accepted_name exists; otherwise the suggested_name +#' is the aligned_name. +#' - genus: the genus of the accepted (or suggested) name; only APC-accepted +#' genus names are filled in. +#' - family: the family of the accepted (or suggested) name; only APC-accepted +#' family names are filled in. #' - taxon_rank: the taxonomic rank of the suggested (and accepted) name. -#' - taxonomic_dataset: the source of the suggested (and accepted) names (APC or APNI). +#' - taxonomic_dataset: the source of the suggested (and accepted) names (APC or +#' APNI). #' - taxonomic_status: the taxonomic status of the suggested (and accepted) name. -#' - taxonomic_status_aligned: the taxonomic status of the aligned name, before any taxonomic updates have been applied. -#' - aligned_reason: the explanation of a specific taxon name alignment (from an original name to an aligned name). -#' - update_reason: the explanation of a specific taxon name update (from an aligned name to an accepted or suggested name). +#' - taxonomic_status_aligned: the taxonomic status of the aligned name, before +#' any taxonomic updates have been applied. +#' - aligned_reason: the explanation of a specific taxon name alignment (from an +#' original name to an aligned name). +#' - update_reason: the explanation of a specific taxon name update (from an +#' aligned name to an accepted or suggested name). #' - subclass: the subclass of the accepted name. -#' - taxon_distribution: the distribution of the accepted name; only filled in if an APC accepted_name is available. -#' - scientific_name_authorship: the authorship information for the accepted (or synonymous) name; available for both APC and APNI names. 
-#' - taxon_ID: the unique taxon concept identifier for the accepted_name; only filled in if an APC accepted_name is available. -#' - taxon_ID_genus: an identifier for the genus; only filled in if an APC-accepted genus name is available. -#' - scientific_name_ID: an identifier for the nomenclatural (not taxonomic) details of a scientific name; available for both APC and APNI names. +#' - taxon_distribution: the distribution of the accepted name; only filled in +#' if an APC accepted_name is available. +#' - scientific_name_authorship: the authorship information for the accepted +#' (or synonymous) name; available for both APC and APNI names. +#' - taxon_ID: the unique taxon concept identifier for the accepted_name; only +#' filled in if an APC accepted_name is available. +#' - taxon_ID_genus: an identifier for the genus; only filled in if an +#' APC-accepted genus name is available. +#' - scientific_name_ID: an identifier for the nomenclatural (not taxonomic) +#' details of a scientific name; available for both APC and APNI names. #' - row_number: the row number of a specific original_name in the input. -#' - number_of_collapsed_taxa: when taxonomic_splits == "collapse_to_higher_taxon", the number of possible taxon names that have been collapsed. +#' - number_of_collapsed_taxa: when taxonomic_splits == "collapse_to_higher_taxon", +#' the number of possible taxon names that have been collapsed. 
#' #' #' @seealso load_taxonomic_resources @@ -53,19 +95,25 @@ #' #' @examples #' # Update taxonomy for two plant names and print the result -#' \donttest{update_taxonomy( -#' tibble::tibble( +#' \donttest{ +#' resources <- load_taxonomic_resources() +#' +#' update_taxonomy( +#' dplyr::tibble( #' original_name = c("Dryandra preissii", "Banksia acuminata"), #' aligned_name = c("Dryandra preissii", "Banksia acuminata"), #' taxon_rank = c("species", "species"), #' taxonomic_dataset = c("APC", "APC"), -#' aligned_reason = NA_character_ -#' ) +#' aligned_reason = c(NA_character_, +#' NA_character_) +#' ), +#' resources = resources #' ) #' } update_taxonomy <- function(aligned_data, taxonomic_splits = "most_likely_species", + quiet = TRUE, output = NULL, resources = load_taxonomic_resources()) { @@ -116,7 +164,7 @@ update_taxonomy <- function(aligned_data, ## create a blank tibble with all columns, for taxon lists where some columns aren't created in any of the individual tibbles taxa_blank <- - tibble::tibble( + dplyr::tibble( original_name = character(0L), aligned_name = character(0L), accepted_name = character(0L), @@ -191,8 +239,11 @@ update_taxonomy <- function(aligned_data, taxa_out <- taxa_out %>% dplyr::arrange(row_number) if (!is.null(output)) { + taxa_out$checked<-TRUE + taxa_out$known<-!is.na(taxa_out$accepted_name) readr::write_csv(taxa_out, output) - message(" - output saved in file: ", output) + if(!quiet) + message(" - output saved in file: ", output) } taxa_out @@ -229,8 +280,7 @@ relevel_taxonomic_status_preferred_order <- function(taxonomic_status) { "included" ) - forcats::fct_relevel( - taxonomic_status, + factor(taxonomic_status, levels = subset( preferred_order, preferred_order %in% taxonomic_status @@ -329,16 +379,44 @@ update_taxonomy_APC_family <- function(data, resources) { if(is.null(data)) return(NULL) + families <- resources$family_accepted %>% + dplyr::bind_rows(resources$family_synonym) %>% + dplyr::mutate(family = genus) + data %>% 
dplyr::mutate( - suggested_name = aligned_name, - accepted_name = NA_character_, family = genus, genus = NA_character_, - taxonomic_status_genus = NA_character_, + taxonomic_status_genus = NA_character_ + ) %>% + dplyr::left_join( + by = "family", + families %>% + dplyr::arrange(canonical_name, taxonomic_status) %>% + dplyr::distinct(canonical_name, .keep_all = TRUE) %>% + dplyr::select( + family, + accepted_name_usage_ID, + taxonomic_status + ) + ) %>% + dplyr::mutate(my_order = relevel_taxonomic_status_preferred_order(taxonomic_status)) %>% + dplyr::arrange(aligned_name, my_order) %>% + dplyr::mutate( + # if required, update the family name in the `aligned_name` to the currently APC-accepted family + family_accepted = families$canonical_name[match(accepted_name_usage_ID, families$taxon_ID)] + ) %>% + dplyr::mutate( + accepted_name = NA_character_, + # family names in `aligned_name` that are not APC-accepted need to be updated to their current name in `suggested_name` + aligned_minus_genus = stringr::str_replace(aligned_name, family, ""), + # if there is an APC-accepted genus, replace whatever the initial genus was with the accepted genus, otherwise the suggested name is the aligned name + suggested_name = ifelse(my_order == "accepted", aligned_name, paste0(family_accepted, aligned_minus_genus)), taxonomic_status = "family accepted", - taxonomic_dataset = "APC" - ) + taxonomic_dataset = "APC", + family = family_accepted + ) %>% + dplyr::select(-accepted_name_usage_ID, -family_accepted, -my_order) } # Function to update names of taxa whose aligned_names are @@ -424,7 +502,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources, dplyr::group_by(canonical_name) %>% dplyr::mutate( number_of_collapsed_taxa = sum(number_of_collapsed_taxa), - accepted_name_2 = paste(stringr::word(accepted_name_2, 1), "sp."), + accepted_name_2 = paste(word(accepted_name_2, 1), "sp."), alternative_possible_names = alternative_accepted_name_tmp %>% unique() %>% 
@@ -437,7 +515,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources, dplyr::mutate( alternative_possible_names = ifelse(taxonomic_status_aligned != "accepted" & canonical_name %in% resources$'APC list (accepted)'$canonical_name, NA, alternative_possible_names), alternative_possible_names = stringr::str_replace_all(alternative_possible_names, "\\ \\|\\ NA", ""), - suggested_collapsed_name = paste(stringr::word(accepted_name_2, 1), "sp. [collapsed names:", alternative_possible_names, "]"), + suggested_collapsed_name = paste(word(accepted_name_2, 1), "sp. [collapsed names:", alternative_possible_names, "]"), taxon_rank = ifelse(number_of_collapsed_taxa > 1 & species_and_infraspecific(taxon_rank), "genus", taxon_rank) ) %>% dplyr::select(-alternative_accepted_name_tmp, -alternative_possible_names) @@ -529,7 +607,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources, ## there are rare cases of names within the APC that do not align to an accepted name. 
## For these taxa, the `suggested_name` is the `aligned_name` and the family name must be added genus = ifelse(is.na(genus_accepted), genus, genus_accepted), - family = ifelse(is.na(family), resources$APC$family[match(stringr::word(suggested_name, 1), resources$APC$genus)], family), + family = ifelse(is.na(family), resources$APC$family[match(word(suggested_name, 1), resources$APC$genus)], family), update_reason = ifelse( (number_of_collapsed_taxa > 1) & !is.na(number_of_collapsed_taxa), "collapsed to genus due to ambiguity", @@ -539,7 +617,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources, ## next line just in case duplication snuck in - there are rare cases where one of the left_joins duplicates a row dplyr::distinct(row_number, original_name, aligned_name, accepted_name, .keep_all = TRUE) %>% dplyr::select(original_name, aligned_name, suggested_name, accepted_name, accepted_name_2, - taxonomic_status, taxonomic_status_aligned, taxon_rank, number_of_collapsed_taxa, everything()) + taxonomic_status, taxonomic_status_aligned, taxon_rank, number_of_collapsed_taxa, dplyr::everything()) } # Function to update names of taxa whose aligned_names are @@ -577,7 +655,7 @@ update_taxonomy_APNI_species_and_infraspecific_taxa <- function(data, resources) aligned_name, suggested_name ), - genus = stringr::word(suggested_name, 1) + genus = word(suggested_name, 1) ) %>% # when possible the genus of APNI names is matched to an APC-accepted genus and the appropriate genus-level taxon_ID is added dplyr::left_join( diff --git a/R/word.R b/R/word.R new file mode 100644 index 00000000..a8f51bf3 --- /dev/null +++ b/R/word.R @@ -0,0 +1,47 @@ +#' Extract words from a sentence. Intended as a faster +#' replacement for stringr::word +#' +#' @param string A character vector +#' @param start,end Pair of integer vectors giving range of words (inclusive) +#' to extract. The default value select the first word. +#' @param sep Separator between words. 
Defaults to single space.
+#' @return A character vector with the same length as `string`. Elements are
+#'   `NA` when `string` has fewer than `end` words. Multi-word results are
+#'   joined with `sep`.
+#' @examples
+#' spp <- c("Banksia serrata", "Actinotus helanthii")
+#' APCalign:::word(spp, 1)
+#' APCalign:::word(spp, 2)
+#' APCalign:::word(spp, 1, 2)
+#' @noRd
+word <- function(string, start = 1L, end = start, sep = " ") {
+  # `start` and `end` are treated as scalar word positions. `sep` is the
+  # pattern used to split and the glue used to re-join multi-word results.
+  if (end == start) {
+    # Fast path: one word, nothing to paste back together.
+    return(stringr::str_split_i(string, sep, start))
+  }
+
+  if ((end - start) %in% c(1, 2)) {
+    # Two or three words: extract each position directly, which avoids
+    # building the full list of pieces for every string.
+    pieces <- lapply(
+      seq(start, end),
+      function(i) stringr::str_split_i(string, sep, i)
+    )
+    out <- do.call(paste, c(pieces, list(sep = sep)))
+    # A missing later word means the string is too short for the range.
+    too_short <- Reduce(`|`, lapply(pieces[-1], is.na))
+    out[too_short] <- NA_character_
+    return(out)
+  }
+
+  # General case: split once, then re-join the requested range.
+  pieces <- stringr::str_split(string, sep)
+  idx <- seq(start, end)
+  out <- purrr::map_chr(pieces, ~ paste(.x[idx], collapse = sep))
+  # `map_chr()` keeps the documented character-vector return type; strings
+  # with fewer than `end` words cannot supply the full range.
+  out[lengths(pieces) < end] <- NA_character_
+  out
+}
diff --git a/README.Rmd b/README.Rmd index 107a2799..182928d5 100644 --- a/README.Rmd +++ b/README.Rmd @@ -30,37 +30,73 @@ the established status (native/introduced) of plant taxa across different states ## Installation -'APCalign' is current not on CRAN. 
Install the currently development version: +For Windows and Linux: ```{r install, eval= FALSE} + # install.packages("remotes") -# remotes::install_github("traitecoevo/APCalign") +# remotes::install_github("traitecoevo/APCalign", dependencies = TRUE, upgrade = "ask") + +``` + +for MacOS there is currently an extra line needed to install a working binary of the `arrow` dependency from r-universe instead of CRAN: + +```{r install_mac, eval= FALSE} + +# install.packages("arrow", repos = c('https://apache.r-universe.dev', 'https://cloud.r-project.org')) +# remotes::install_github("traitecoevo/APCalign", dependencies = TRUE, upgrade = "ask") -library(APCalign) ``` + ## A quick demo -Generating a look-up table can be done with just one function +Generating a look-up table can be done with just one function: + +```{r} -```{r,message=FALSE} -# Load APC/APNI resources into R -resources <- load_taxonomic_resources() +library(APCalign) -# Create lookup create_taxonomic_update_lookup( taxa = c( "Banksia integrifolia", "Acacia longifolia", "Commersonia rosea" + ) +) +``` + +if you're going to use APCalign more than once, it will save you time to load the taxonomic resources into memory first: + +```{r} + +tax_resources <- load_taxonomic_resources() + +create_taxonomic_update_lookup( + taxa = c( + "Banksia integrifolia", + "Acacia longifolia", + "Commersonia rosea", + "not a species" ), - resources = resources + resources = tax_resources ) ``` +Checking for Australian natives: + +```{r, message=FALSE} + +native_anywhere_in_australia(c("Eucalyptus globulus","Pinus radiata"), resources = tax_resources) + +``` +## Shiny application + +We also developed a shiny application for non-R users to update and align their taxonomic names. You can find the application here: https://unsw.shinyapps.io/APCalign-app + ## Learn more -Highly recommend looking at our [Getting Started](https://traitecoevo.github.io/APCalign/articles/APCalign.html) vignette to learn about how to use 'APCalign'. 
You can also learn more about our [taxa matching algorithm](https://traitecoevo.github.io/APCalign/articles/updating-taxon-names.html) and how [APC/APNI data is cached](https://traitecoevo.github.io/APCalign/articles/caching.html) behind-the-scenes. +Highly recommend looking at our [Getting Started](https://traitecoevo.github.io/APCalign/articles/APCalign.html) vignette to learn about how to use 'APCalign'. You can also learn more about our [taxa matching algorithm](https://traitecoevo.github.io/APCalign/articles/updating-taxon-names.html). ## Found a bug? diff --git a/README.md b/README.md index 0b9ce831..90a13d13 100644 --- a/README.md +++ b/README.md @@ -20,54 +20,114 @@ taxa across different states/territories. ## Installation -‘APCalign’ is current not on CRAN. Install the currently development -version: +For Windows and Linux: ``` r + # install.packages("remotes") -# remotes::install_github("traitecoevo/APCalign") +# remotes::install_github("traitecoevo/APCalign", dependencies = TRUE, upgrade = "ask") +``` -library(APCalign) +for MacOS there is currently an extra line needed to install a working +binary of the `arrow` dependency from r-universe instead of CRAN: + +``` r + +# install.packages("arrow", repos = c('https://apache.r-universe.dev', 'https://cloud.r-project.org')) +# remotes::install_github("traitecoevo/APCalign", dependencies = TRUE, upgrade = "ask") ``` ## A quick demo -Generating a look-up table can be done with just one function +Generating a look-up table can be done with just one function: ``` r -# Load APC/APNI resources into R -resources <- load_taxonomic_resources() -# Create lookup +library(APCalign) + create_taxonomic_update_lookup( taxa = c( "Banksia integrifolia", "Acacia longifolia", "Commersonia rosea" - ), - resources = resources + ) ) -#> # A tibble: 3 × 12 -#> original_name aligned_name accepted_name suggested_name genus taxon_rank -#> -#> 1 Banksia integrifol… Banksia int… Banksia inte… Banksia integ… Bank… species -#> 2 Acacia 
longifolia Acacia long… Acacia longi… Acacia longif… Acac… species -#> 3 Commersonia rosea Commersonia… Androcalva r… Androcalva ro… Andr… species -#> # ℹ 6 more variables: taxonomic_dataset , taxonomic_status , -#> # scientific_name , aligned_reason , update_reason , -#> # number_of_collapsed_taxa +#> Checking alignments of 3 taxa +``` + + #> Loading resources into memory... + #> ================================================================================================================================================================ + #> ...done + #> -> of these 2 names have a perfect match to a scientific name in the APC. Alignments being sought for remaining names. + #> # A tibble: 3 × 12 + #> original_name aligned_name accepted_name suggested_name genus taxon_rank + #> + #> 1 Banksia integrifol… Banksia int… Banksia inte… Banksia integ… Bank… species + #> 2 Acacia longifolia Acacia long… Acacia longi… Acacia longif… Acac… species + #> 3 Commersonia rosea Commersonia… Androcalva r… Androcalva ro… Andr… species + #> # ℹ 6 more variables: taxonomic_dataset , taxonomic_status , + #> # scientific_name , aligned_reason , update_reason , + #> # number_of_collapsed_taxa + +if you’re going to use APCalign more than once, it will save you time to +load the taxonomic resources into memory first: + +``` r + +tax_resources <- load_taxonomic_resources() +``` + + #> Loading resources into memory... + #> ================================================================================================================================================================ + #> ...done + + create_taxonomic_update_lookup( + taxa = c( + "Banksia integrifolia", + "Acacia longifolia", + "Commersonia rosea", + "not a species" + ), + resources = tax_resources + ) + #> Checking alignments of 4 taxa + #> -> of these 2 names have a perfect match to a scientific name in the APC. Alignments being sought for remaining names. 
+ #> # A tibble: 4 × 12 + #> original_name aligned_name accepted_name suggested_name genus taxon_rank + #> + #> 1 Banksia integrifol… Banksia int… Banksia inte… Banksia integ… Bank… species + #> 2 Acacia longifolia Acacia long… Acacia longi… Acacia longif… Acac… species + #> 3 Commersonia rosea Commersonia… Androcalva r… Androcalva ro… Andr… species + #> 4 not a species + #> # ℹ 6 more variables: taxonomic_dataset , taxonomic_status , + #> # scientific_name , aligned_reason , update_reason , + #> # number_of_collapsed_taxa + +Checking for Australian natives: + +``` r + +native_anywhere_in_australia(c("Eucalyptus globulus","Pinus radiata"), resources = tax_resources) +#> # A tibble: 2 × 2 +#> species native_anywhere_in_aus +#> +#> 1 Eucalyptus globulus native +#> 2 Pinus radiata introduced ``` +## Shiny application + +We also developed a shiny application for non-R users to update and +align their taxonomic names. You can find the application here: + + ## Learn more Highly recommend looking at our [Getting Started](https://traitecoevo.github.io/APCalign/articles/APCalign.html) vignette to learn about how to use ‘APCalign’. You can also learn more about our [taxa matching -algorithm](https://traitecoevo.github.io/APCalign/articles/updating-taxon-names.html) -and how [APC/APNI data is -cached](https://traitecoevo.github.io/APCalign/articles/caching.html) -behind-the-scenes. +algorithm](https://traitecoevo.github.io/APCalign/articles/updating-taxon-names.html). ## Found a bug? 
diff --git a/_pkgdown.yml b/_pkgdown.yml index 4e5a7034..8028a94d 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -16,32 +16,33 @@ navbar: articles: text: Articles menu: - - text: "Data providers" - - text: APC and APNI - href: articles/data-providers.html - - text: "Functions" - - text: Details on the 10 exported functions, including examples of usage - href: articles/function_notes.html - - text: ------- - - text: "Taxon matching" - - text: Our fuzzy matching algorithm + - text: Data sources (APC & APNI) + href: articles/articles/data-providers.html + - text: Taxon matching href: articles/updating-taxon-names.html + - text: Using APC versions for reproducibility + href: articles/reproducibility.html + reference: -- subtitle: Standardise plant taxon names +- subtitle: Align and update taxon names - contents: - - load_taxonomic_resources - create_taxonomic_update_lookup - align_taxa - update_taxonomy +- subtitle: Standardise and simplify plant taxon names +- contents: - standardise_names + - standardise_taxon_rank - strip_names - - strip_names_2 + - strip_names_extra - subtitle: Established status across states/territories - contents: - create_species_state_origin_matrix - state_diversity_counts - native_anywhere_in_australia -- title: Data +- subtitle: Data - contents: + - load_taxonomic_resources + - default_version - gbif_lite diff --git a/inst/CITATION b/inst/CITATION new file mode 100644 index 00000000..4dd88ad2 --- /dev/null +++ b/inst/CITATION @@ -0,0 +1,17 @@ +bibentry( + bibtype = "Unpublished", + title = "APCalign: an R package workflow and app for aligning and updating flora names to the Australian Plant Census", + author = c( + person(given = "Elizabeth", family = "Wenk", role = c("aut", "ctb"), email = "e.wenk@unsw.edu.au", comment = c(ORCID = "0000-0001-5640-5910")), + person(given = "Will", family = "Cornwell", role = c("aut", "ctb"), email = "w.cornwell@unsw.edu.au", comment = c(ORCID = "0000-0003-4080-4073")), + person(given = "Ann", family= 
"Fuchs", role = c("aut"), email = "anne.fuchs@dcceew.gov.au", comment = c(ORCID = "0000-0001-5737-8803")), + person(given = "Fonti", family = "Kar", role = c("aut", "ctb"), email = "f.kar@unsw.edu.au", comment = c(ORCID = "0000-0002-2760-3974")), + person(given = "Anna", family= "Monro", role = c("aut"), email = "anna.monro@dcceew.gov.au", comment = c(ORCID = "0000-0001-9031-2670")), + person(given = "Herve", family= "Sauquet", role = c("aut"), email = "herve.sauquet@botanicgardens.nsw.gov.au", comment = c(ORCID = "0000-0001-8305-3236")), + person(given = "Ruby", family= "Stephens", role = c("aut"), email = "stephenseruby@gmail.com", comment = c(ORCID = "0000-0002-3767-2690")), + person(given = "Daniel", family = "Falster", role = c("aut", "cre", "cph"), email = "daniel.falster@unsw.edu.au", comment = c(ORCID = "0000-0002-9814-092X")) + ), + year = 2024, + note = paste("R package version:", packageVersion("APCalign")), + url = "https://www.biorxiv.org/content/10.1101/2024.02.02.578715v1" +) diff --git a/inst/extdata/match_taxa_documentation.csv b/inst/extdata/match_taxa_documentation.csv index 582811b5..16ee4e06 100644 --- a/inst/extdata/match_taxa_documentation.csv +++ b/inst/extdata/match_taxa_documentation.csv @@ -48,5 +48,8 @@ match_12a,"Detect genus, by checking the first word in the string","first word ( match_12b,"Detect genus, by checking the first word in the string","first word (""genus"")",exact,other APC taxon concepts,genus, match_12c,"Detect genus, by checking the first word in the string","first word (""genus"")",exact,APNI,genus, match_12d,"Detect family, by checking the first word in the string","first word (""genus"")",exact,APC accepted taxon concepts,family, -match_12e,"Detect genus, by checking the first word in the string","first word (""genus"")",fuzzy,APC accepted taxon concepts,genus, -match_12f,"Detect genus, by checking the first word in the string","first word (""genus"")",fuzzy,other APC taxon concepts,genus, +match_12e,"Detect family, 
by checking the first word in the string","first word (""genus"")",exact,other APC taxon concepts,family, +match_12f,"Detect genus, by checking the first word in the string","first word (""genus"")",fuzzy,APC accepted taxon concepts,genus, +match_12g,"Detect genus, by checking the first word in the string","first word (""genus"")",fuzzy,other APC taxon concepts,genus, +match_12h,"Detect family, by checking the first word in the string","first word (""genus"")",fuzzy,APC accepted taxon concepts,family, +match_12i,"Detect family, by checking the first word in the string","first word (""genus"")",fuzzy,other APC taxon concepts,family, diff --git a/inst/extdata/test_taxa.csv b/inst/extdata/test_taxa.csv index b1dabc09..f88fc9f5 100644 --- a/inst/extdata/test_taxa.csv +++ b/inst/extdata/test_taxa.csv @@ -1,33 +1,33 @@ -original_name -Banksia serrata -Banksia serrate -Banksee serrate -Banksia cerrata -Banksia sp. -Dryandra sp. -Argyrodendron (Whyanbeel) -Argyrodendron ssp. (Whyanbeel BH 1106RFK) -Argyrodendron Whyanbeel -Argyrodendron sp. (Whyanbeel BH 1106RFK) -Argyrodendron sp. Whyanbeel (B.P.Hyland RFK 1106) -Argyrodendron sp. Whyanbeel (B.P.Hyland RFK1106) -Dryandra aurantia -Banksia aurantia -Dryandra blechnifolia -Banksia pellaeifolia -Dryandra idiogenes -Banksia idiogenes -Dryandra lindleyana -Banksia dallanneyi -Acacia aneura -Acacia minyura -Acacia paraneura -Racosperma aneurum -Acacia aneura var. intermedia -Banksia (has long pink leaves) -Dryandra (has long pink leaves) -Acacia minyura / Acacia paraneura -Acacia aphanoclada x Acacia pyrifolia var. pyrifolia -Acacia minyura x Acacia paraneura -"no clue, a monocot" -Orchidaceae (epiphtye) +original_name,notes +Banksia serrata,notes_01 +Banksia serrate,notes_02 +Banksee serrate,notes_03 +Banksia cerrata,notes_04 +Banksia sp.,notes_05 +Dryandra sp.,notes_06 +Argyrodendron (Whyanbeel) ,notes_07 +Argyrodendron ssp. (Whyanbeel BH 1106RFK) ,notes_08 +Argyrodendron Whyanbeel ,notes_09 +Argyrodendron sp. 
(Whyanbeel BH 1106RFK) ,notes_10 +Argyrodendron sp. Whyanbeel (B.P.Hyland RFK 1106),notes_11 +Argyrodendron sp. Whyanbeel (B.P.Hyland RFK1106) ,notes_12 +Dryandra aurantia,notes_13 +Banksia aurantia,notes_14 +Dryandra blechnifolia,notes_15 +Banksia pellaeifolia,notes_16 +Dryandra idiogenes,notes_17 +Banksia idiogenes,notes_18 +Dryandra lindleyana,notes_19 +Banksia dallanneyi,notes_20 +Acacia aneura,notes_21 +Acacia minyura,notes_22 +Acacia paraneura,notes_23 +Racosperma aneurum,notes_24 +Acacia aneura var. intermedia,notes_25 +Banksia (has long pink leaves),notes_26 +Dryandra (has long pink leaves),notes_27 +Acacia minyura / Acacia paraneura,notes_28 +Acacia aphanoclada x Acacia pyrifolia var. pyrifolia,notes_29 +Acacia minyura x Acacia paraneura,notes_30 +"no clue, a monocot",notes_31 +Orchidaceae (epiphtye),notes_32 diff --git a/man/APCalign.Rd b/man/APCalign.Rd index d9730198..9936d5a7 100644 --- a/man/APCalign.Rd +++ b/man/APCalign.Rd @@ -2,8 +2,8 @@ % Please edit documentation in R/APCalign-package.R \docType{package} \name{APCalign} -\alias{APCalign} \alias{APCalign-package} +\alias{APCalign} \title{Standardising Taxonomic Names in Australian Plants} \description{ The process of standardising taxon names is necessary when working with @@ -32,7 +32,8 @@ the established status of plant taxa across different states/territories. 
\references{ If you have any questions, comments or suggestions, please -submit an issue at our \href{https://github.com/traitecoevo/APCalign/issues}{GitHub repository} +submit an issue at our +\href{https://github.com/traitecoevo/APCalign/issues}{GitHub repository} } \seealso{ Useful links: diff --git a/man/align_taxa.Rd b/man/align_taxa.Rd index 2d165adb..c16a3240 100644 --- a/man/align_taxa.Rd +++ b/man/align_taxa.Rd @@ -2,13 +2,14 @@ % Please edit documentation in R/align_taxa.R \name{align_taxa} \alias{align_taxa} -\title{For a list of Australian plant names, find taxonomic or scientific name alignments to the APC or APNI through standardizing formatting and fixing spelling errors} +\title{Align Australian plant scientific names to the APC or APNI} \usage{ align_taxa( original_name, output = NULL, full = FALSE, resources = load_taxonomic_resources(), + quiet = FALSE, fuzzy_abs_dist = 3, fuzzy_rel_dist = 0.2, fuzzy_matches = TRUE, @@ -24,63 +25,188 @@ align_taxa( \item{full}{Parameter to determine how many columns are output} -\item{resources}{the taxonomic resources used to align the taxa names. Loading this can be slow, -so call \code{\link{load_taxonomic_resources}} separately to greatly speed this function up and pass the resources in.} +\item{resources}{the taxonomic resources used to align the taxa names. +Loading this can be slow, so call \code{\link{load_taxonomic_resources}} +separately to greatly speed this function up and pass the resources in.} -\item{fuzzy_abs_dist}{The number of characters allowed to be different for a fuzzy match.} +\item{quiet}{Logical to indicate whether to display messages while +aligning taxa.} -\item{fuzzy_rel_dist}{The proportion of characters allowed to be different for a fuzzy match.} +\item{fuzzy_abs_dist}{The number of characters allowed to be different for a +fuzzy match.} -\item{fuzzy_matches}{Fuzzy matches are turned on as a default. 
The relative and absolute distances allowed for fuzzy matches to species and infraspecific taxon names are defined by the parameters \code{fuzzy_abs_dist} and \code{fuzzy_rel_dist}} +\item{fuzzy_rel_dist}{The proportion of characters allowed to be different +for a fuzzy match.} -\item{imprecise_fuzzy_matches}{Imprecise fuzzy matches are turned off as a default.} +\item{fuzzy_matches}{Fuzzy matches are turned on as a default. +The relative and absolute distances allowed for fuzzy matches to species and +infraspecific taxon names are defined by the parameters \code{fuzzy_abs_dist} +and \code{fuzzy_rel_dist}} -\item{APNI_matches}{Name matches to the APNI (Australian Plant Names Index) are turned off as a default.} +\item{imprecise_fuzzy_matches}{Imprecise fuzzy matches uses the +fuzzy matching function with lenient levels set (absolute distance of +5 characters; relative distance = 0.25). +It offers a way to get a wider range of possible names, possibly +corresponding to very distant spelling mistakes. +This is FALSE as default and all outputs should be checked as it often +makes erroneous matches.} -\item{identifier}{A dataset, location or other identifier, which defaults to NA.} +\item{APNI_matches}{Name matches to the APNI (Australian Plant Names Index) +are turned on as a default.} + +\item{identifier}{A dataset, location or other identifier, +which defaults to NA.} } \value{ -A tibble with columns that include original_name, aligned_name, taxonomic_dataset, taxon_rank, aligned_reason, alignment_code. +A tibble with columns that include original_name, aligned_name, +taxonomic_dataset, taxon_rank, aligned_reason, alignment_code. \itemize{ \item original_name: the original plant name input. -\item aligned_name: the original plant name after the function standardise_names has standardised the syntax of infraspecific taxon designations. 
+\item aligned_name: the original plant name after the function standardise_names +has standardised the syntax of infraspecific taxon designations. \item taxonomic_dataset: the source of the aligned names (APC or APNI). \item taxon_rank: the taxonomic rank of the aligned name. -\item aligned_reason: the explanation of a specific taxon name alignment (from an original name to an aligned name). -\item alignment_code: a code that accompanies the aligned_reason, indicating the relative sequence of the match during the alignment process. -\item cleaned_name: original name with punctuation and infraspecific taxon designation terms standardised by the function standardise_names; streamlines exact matches. -\item stripped_name: cleaned name with punctuation and infraspecific taxon designation terms removed by the function strip_names; improves fuzzy matches. -\item stripped_name2: cleaned name with punctuation, infraspecific taxon designation terms, and other filler words removed by the function strip_names_2; required for matches to \verb{first two word} and \verb{first three words}. -\item trinomial: the first three words in \code{stripped_name2}, required for matches that ignore all other text in the original_name; improves phrase name matches. -\item binomial: the first two words in \code{stripped_name2}, required for matches that ignore all other text in the original_name; improves phrase name matches. -\item genus: the first two words in \code{cleaned_name}; required for genus-rank matches and reprocessing of genus-rank names. -\item fuzzy_match_genus: fuzzy match of genus column to best match among APC-accepted names; required for fuzzy matches of genus-rank names. -\item fuzzy_match_genus_synonym: fuzzy match of genus column to best match among APC-known names, only considering different matches to those documented under APC-accepted genera; required for fuzzy matches of genus-rank names. 
-\item fuzzy_match_genus_APNI: fuzzy match of genus column to best match among APNI names, only considering different matches to those documented under APC-accepted and APC-known genera; required for fuzzy matches of genus-rank names. -\item fuzzy_match_cleaned_APC: fuzzy match of stripped_name to APC-accepted names; created for yet-to-be-aligned names at the match step 07a in the function \code{match_taxa}. -\item fuzzy_match_cleaned_APC_synonym: fuzzy match of stripped_name to APC-known names; created for yet-to-be-aligned names at the match step 07b in the function \code{match_taxa}. -\item fuzzy_match_cleaned_APC_imprecise: imprecise fuzzy match of stripped_name to APC-accepted names; created for yet-to-be-aligned names at the match step 10a in the function \code{match_taxa}. -\item fuzzy_match_cleaned_APC_synonym_imprecise: imprecise fuzzy match of stripped_name to APC-accepted names; created for yet-to-be-aligned names at the match step 10b in the function \code{match_taxa}. -\item fuzzy_match_binomial: fuzzy match of binomial column to best match among APC-accepted names; created for yet-to-be-aligned names at match step 15a in the function \code{match_taxa}. -\item fuzzy_match_binomial_APC_synonym: fuzzy match of binomial column to best match among APC-known names; created for yet-to-be-aligned names at match step 15a in the function \code{match_taxa}. -\item fuzzy_match_trinomial: fuzzy match of trinomial column to best match among APC-accepted names; created for yet-to-be-aligned names at match step 16a in the function \code{match_taxa}. -\item fuzzy_match_trinomial_synonym: fuzzy match of trinomial column to best match among APC-known names; created for yet-to-be-aligned names at match step 16b in the function \code{match_taxa}. -\item fuzzy_match_cleaned_APNI: fuzzy match of stripped_name to APNI names; created for yet-to-be-aligned names at the match step 16a in the function \code{match_taxa}. 
-\item fuzzy_match_cleaned_APNI_imprecise: imprecise fuzzy match of stripped_name to APNI names; created for yet-to-be-aligned names at the match step 17a in the function \code{match_taxa}. +\item aligned_reason: the explanation of a specific taxon name alignment +(from an original name to an aligned name). +\item alignment_code: a code that accompanies the aligned_reason, indicating the +relative sequence of the match during the alignment process. +\item cleaned_name: original name with punctuation and infraspecific taxon +designation terms standardised by the function standardise_names; +streamlines exact matches. +\item stripped_name: cleaned name with punctuation and infraspecific taxon +designation terms removed by the function strip_names; +improves fuzzy matches. +\item stripped_name2: cleaned name with punctuation, infraspecific taxon +designation terms, and other filler words removed by +the function \code{strip_names_extra}; +required for matches to \verb{first two word} and \verb{first three words}. +\item trinomial: the first three words in \code{stripped_name2}, required for matches +that ignore all other text in the original_name; +improves phrase name matches. +\item binomial: the first two words in \code{stripped_name2}, required for matches +that ignore all other text in the original_name; +improves phrase name matches. +\item genus: the first two words in \code{cleaned_name}; +required for genus-rank matches and reprocessing of genus-rank names. +\item fuzzy_match_genus: fuzzy match of genus column to best match among +APC-accepted names; +required for fuzzy matches of genus-rank names. +\item fuzzy_match_genus_synonym: fuzzy match of genus column to best match among +APC-synonymous names, only considering different matches to those documented +under APC-accepted genera; required for fuzzy matches of genus-rank names. 
+\item fuzzy_match_genus_APNI: fuzzy match of genus column to best match among +APNI names, only considering different matches to those documented under +APC-accepted and APC-known genera; required for fuzzy matches of +genus-rank names. +\item fuzzy_match_family: fuzzy match of genus column to best match among +APC-accepted family names; required for fuzzy matches of family-rank names. +\item fuzzy_match_family_synonym: fuzzy match of genus column to best match +among APC-synonymous family names; required for fuzzy matches of +family-rank names. +\item fuzzy_match_cleaned_APC: fuzzy match of stripped_name to APC-accepted +names; created for yet-to-be-aligned names at the match step 05a +in the function \code{match_taxa}. +\item fuzzy_match_cleaned_APC_synonym: fuzzy match of stripped_name to +APC-synonymous names; created for yet-to-be-aligned names at the +match step 05b in the function \code{match_taxa}. +\item fuzzy_match_cleaned_APC_imprecise: imprecise fuzzy match of stripped_name +to APC-accepted names; created for yet-to-be-aligned names at the +match step 07a in the function \code{match_taxa}. +\item fuzzy_match_cleaned_APC_synonym_imprecise: imprecise fuzzy match of +stripped_name to APC-synonymous names; created for yet-to-be-aligned names +at the match step 07b in the function \code{match_taxa}. +\item fuzzy_match_binomial: fuzzy match of binomial column to best match among +APC-accepted names; created for yet-to-be-aligned names at +match step 10c in the function \code{match_taxa}. +\item fuzzy_match_binomial_APC_synonym: fuzzy match of binomial column to best +match among APC-synonymous names; created for yet-to-be-aligned names at +match step 10d in the function \code{match_taxa}. +\item fuzzy_match_trinomial: fuzzy match of trinomial column to best match +among APC-accepted names; created for yet-to-be-aligned names at +match step 09c in the function \code{match_taxa}.
+\item fuzzy_match_trinomial_synonym: fuzzy match of trinomial column to best +match among APC-synonymous names; created for yet-to-be-aligned names at +match step 09d in the function \code{match_taxa}. +\item fuzzy_match_cleaned_APNI: fuzzy match of stripped_name to APNI names; +created for yet-to-be-aligned names at the match step 11a in the +function \code{match_taxa}. +\item fuzzy_match_cleaned_APNI_imprecise: imprecise fuzzy match of +stripped_name to APNI names; created for yet-to-be-aligned names +at the match step 11b in the function \code{match_taxa}. } } \description{ -This function finds taxonomic alignments in APC or scientific name alignments in APNI. -It uses the internal function \code{match_taxa} to attempt to match input strings to taxon names in the APC/APNI. -It sequentially searches for matches against more than 20 different string patterns, -prioritising exact matches (to accepted names as well as synonyms, orthographic variants) over fuzzy matches. -It prioritises matches to taxa in the APC over names in the APNI. -It identifies string patterns in input names that suggest a name can only be aligned to a genus -(hybrids that are not in the APC/ANI; graded species; taxa not identified to species), -and indicates these names only have a genus-rank match. +For a list of Australian plant names, find taxonomic or scientific name +alignments to the APC or APNI through standardizing formatting and fixing +spelling errors. + +Usage case: Users will run this function if they wish to see the details +of the matching algorithms, the many output columns that the matching +function compares to as it seeks the best alignment. They may also select +this function if they want to adjust the “fuzziness” level for fuzzy +matches, options not allowed in create_taxonomic_update_lookup. This +function is the first half of create_taxonomic_update_lookup. 
+} +\details{ +\itemize{ +\item This function finds taxonomic alignments in APC or scientific name +alignments in APNI. +\item It uses the internal function \code{match_taxa} to attempt to match input +strings to taxon names in the APC/APNI. +\item It sequentially searches for matches against more than 20 different string +patterns, prioritising exact matches (to accepted names as well as +synonyms, orthographic variants) over fuzzy matches. +\item It prioritises matches to taxa in the APC over names in the APNI. +\item It identifies string patterns in input names that suggest a name can only +be aligned to a genus (hybrids that are not in the APC/APNI; graded species; +taxa not identified to species), and indicates these names only have a +genus-rank match. +} + +Notes: +\itemize{ +\item If you will be running the function APCalign::create_taxonomic_update_lookup +many times, it is best to load the taxonomic resources separately using +resources <- load_taxonomic_resources(), then add the argument +resources = resources +\item The name Banksia cerrata does not align as the fuzzy matching algorithm +does not allow the first letter of the genus and species epithet to change. +\item With this function you have the option of changing the fuzzy matching +parameters. The defaults, with fuzzy matches only allowing changes of 3 +(or fewer) characters AND 20\% (or less) of characters, have been carefully +calibrated to catch just about all typos, but very, very rarely mis-align +a name. If you wish to introduce less conservative fuzzy matching it is +recommended you manually check the aligned names. +\item It is recommended that you begin with imprecise_fuzzy_matches = FALSE (the +default), as quite a few of the less precise fuzzy matches are likely to be +erroneous. This argument should be turned on only if you plan to check all +alignments manually.
+\item The argument identifier allows you to add a fixed text string to all genus- +and family- level names; for example, identifier = "Royal NP" would return "Acacia +sp. [Royal NP]". +} } \examples{ -\donttest{align_taxa(c("Poa annua", "Abies alba"))} +\donttest{ +resources <- load_taxonomic_resources() + +# example 1 +align_taxa(c("Poa annua", "Abies alba"), resources = resources) + +# example 2 +input <- c("Banksia serrata", "Banksia serrate", "Banksia cerrata", +"Banksia serrrrata", "Dryandra sp.", "Banksia big red flowers") + +aligned_taxa <- + APCalign::align_taxa( + original_name = input, + identifier = "APCalign test", + full = TRUE, + resources = resources + ) + +} + } \seealso{ diff --git a/man/create_species_state_origin_matrix.Rd b/man/create_species_state_origin_matrix.Rd index ed019678..d427bdd5 100644 --- a/man/create_species_state_origin_matrix.Rd +++ b/man/create_species_state_origin_matrix.Rd @@ -2,17 +2,23 @@ % Please edit documentation in R/create_species_state_origin_matrix.R \name{create_species_state_origin_matrix} \alias{create_species_state_origin_matrix} -\title{Use the taxon distribution data from the APC to determine state level native and introduced origin status} +\title{State level native and introduced origin status} \usage{ create_species_state_origin_matrix(resources = load_taxonomic_resources()) } \arguments{ -\item{resources}{the taxonomic resources required to make the summary statistics. Loading this can be slow, so call load_taxonomic_resources separately to greatly speed this function up and pass the resources in.} +\item{resources}{the taxonomic resources required to make the summary statistics. +Loading this can be slow, so call load_taxonomic_resources separately to greatly +speed this function up and pass the resources in.} } \value{ -A tibble with columns representing each state and rows representing each species. The values in each cell represent the origin of the species in that state.
+A tibble with columns representing each state and rows representing each +species. The values in each cell represent the origin of the species in that state. } \description{ +This function uses the taxon distribution data from the APC to determine +state level native and introduced origin status. + This function processes the geographic data available in the APC and returns state level native, introduced and more complicated origins status for all taxa. } diff --git a/man/create_taxonomic_update_lookup.Rd b/man/create_taxonomic_update_lookup.Rd index 0ece0bd6..e3ae8743 100644 --- a/man/create_taxonomic_update_lookup.Rd +++ b/man/create_taxonomic_update_lookup.Rd @@ -2,7 +2,8 @@ % Please edit documentation in R/create_taxonomic_update_lookup.R \name{create_taxonomic_update_lookup} \alias{create_taxonomic_update_lookup} -\title{Create a lookup table with the best-possible scientific name match for a list of Australian plant names} +\title{Create a table with the best-possible scientific name match for +Australian plant names} \usage{ create_taxonomic_update_lookup( taxa, @@ -10,72 +11,188 @@ create_taxonomic_update_lookup( version = default_version(), taxonomic_splits = "most_likely_species", full = FALSE, + fuzzy_abs_dist = 3, + fuzzy_rel_dist = 0.2, + fuzzy_matches = TRUE, APNI_matches = TRUE, imprecise_fuzzy_matches = FALSE, identifier = NA_character_, resources = load_taxonomic_resources(), + quiet = FALSE, output = NULL ) } \arguments{ -\item{taxa}{A list of Australian plant species that needs to be reconciled with current taxonomy.} +\item{taxa}{A list of Australian plant species that needs to be reconciled +with current taxonomy.} -\item{stable_or_current_data}{either "stable" for a consistent version, or "current" for the leading edge version.} +\item{stable_or_current_data}{either "stable" for a consistent version, +or "current" for the leading edge version.} \item{version}{The version number of the dataset to use.} -\item{taxonomic_splits}{How to handle 
one_to_many taxonomic matches. Default is "return_all". The other options are "collapse_to_higher_taxon" and "most_likely_species". most_likely_species defaults to the original_name if that name is accepted by the APC; this will be right for certain species subsets, but make errors in other cases, use with caution.} +\item{taxonomic_splits}{How to handle one_to_many taxonomic matches. +Default is "most_likely_species". The other options are "return_all" +and "collapse_to_higher_taxon". most_likely_species defaults to the original_name +if that name is accepted by the APC; this will be right for certain species +subsets, but will make errors in other cases; use with caution.} -\item{full}{logical for whether the full lookup table is returned or just key columns} +\item{full}{logical for whether the full lookup table is returned or +just key columns} -\item{APNI_matches}{Name matches to the APNI (Australian Plant Names Index) are turned off as a default.} +\item{fuzzy_abs_dist}{The number of characters allowed to be different for +a fuzzy match.} -\item{imprecise_fuzzy_matches}{Imprecise fuzzy matches are turned off as a default.} +\item{fuzzy_rel_dist}{The proportion of characters allowed to be different +for a fuzzy match.} -\item{identifier}{A dataset, location or other identifier, which defaults to NA.} +\item{fuzzy_matches}{Fuzzy matches are turned on as a default. The relative +and absolute distances allowed for fuzzy matches to species and +infraspecific taxon names are defined by the parameters \code{fuzzy_abs_dist} +and \code{fuzzy_rel_dist}.} -\item{resources}{These are the taxonomic resources used for cleaning, this will default to loading them from a local place on your computer.
If this is to be called repeatedly, it's much faster to load the resources using \code{\link{load_taxonomic_resources}} separately and pass the data in.} +\item{APNI_matches}{Name matches to the APNI (Australian Plant Names Index) +are turned on as a default.} -\item{output}{file path to save the intermediate output to} +\item{imprecise_fuzzy_matches}{Imprecise fuzzy matching uses the fuzzy +matching function with lenient levels set (absolute distance of +5 characters; relative distance = 0.25). +It offers a way to get a wider range of possible names, possibly +corresponding to very distant spelling mistakes. +This is FALSE by default and all outputs should be checked as it often +makes erroneous matches.} + +\item{identifier}{A dataset, location or other identifier, +which defaults to NA.} + +\item{resources}{These are the taxonomic resources used for cleaning, this +will default to loading them from a local place on your computer. If this is +to be called repeatedly, it's much faster to load the resources using +\code{\link{load_taxonomic_resources}} separately and pass the data in.} + +\item{quiet}{Logical to indicate whether to display messages while +aligning taxa.} + +\item{output}{file path to save the output. If this file already exists, +this function will check if it's a subset of the species passed in and try +to add to this file. This can be useful for large and growing projects.} } \value{ -A lookup table containing the accepted and suggested names for each original name input, and additional taxonomic information such as taxon rank, taxonomic status, taxon IDs and genera. +A lookup table containing the accepted and suggested names for each +original name input, and additional taxonomic information such as taxon +rank, taxonomic status, taxon IDs and genera. \itemize{ \item original_name: the original plant name. -\item aligned_name: the input plant name that has been aligned to a taxon name in the APC or APNI by the align_taxa function.
+\item aligned_name: the input plant name that has been aligned to a taxon name in +the APC or APNI by the align_taxa function. \item accepted_name: the APC-accepted plant name, when available. -\item suggested_name: the suggested plant name to use. Identical to the accepted_name, when an accepted_name exists; otherwise the the suggested_name is the aligned_name. -\item genus: the genus of the accepted (or suggested) name; only APC-accepted genus names are filled in. -\item family: the family of the accepted (or suggested) name; only APC-accepted family names are filled in. +\item suggested_name: the suggested plant name to use. Identical to the +accepted_name, when an accepted_name exists; +otherwise the suggested_name is the aligned_name. +\item genus: the genus of the accepted (or suggested) name; +only APC-accepted genus names are filled in. +\item family: the family of the accepted (or suggested) name; +only APC-accepted family names are filled in. \item taxon_rank: the taxonomic rank of the suggested (and accepted) name. -\item taxonomic_dataset: the source of the suggested (and accepted) names (APC or APNI). +\item taxonomic_dataset: the source of the suggested (and accepted) names +(APC or APNI). \item taxonomic_status: the taxonomic status of the suggested (and accepted) name. -\item taxonomic_status_aligned: the taxonomic status of the aligned name, before any taxonomic updates have been applied. -\item aligned_reason: the explanation of a specific taxon name alignment (from an original name to an aligned name). -\item update_reason: the explanation of a specific taxon name update (from an aligned name to an accepted or suggested name). +\item taxonomic_status_aligned: the taxonomic status of the aligned name, +before any taxonomic updates have been applied. +\item aligned_reason: the explanation of a specific taxon name alignment +(from an original name to an aligned name).
+\item update_reason: the explanation of a specific taxon name update +(from an aligned name to an accepted or suggested name). \item subclass: the subclass of the accepted name. -\item taxon_distribution: the distribution of the accepted name; only filled in if an APC accepted_name is available. -\item scientific_name_authorship: the authorship information for the accepted (or synonymous) name; available for both APC and APNI names. -\item taxon_ID: the unique taxon concept identifier for the accepted_name; only filled in if an APC accepted_name is available. -\item taxon_ID_genus: an identifier for the genus; only filled in if an APC-accepted genus name is available. -\item scientific_name_ID: an identifier for the nomenclatural (not taxonomic) details of a scientific name; available for both APC and APNI names. +\item taxon_distribution: the distribution of the accepted name; +only filled in if an APC accepted_name is available. +\item scientific_name_authorship: the authorship information for the accepted +(or synonymous) name; available for both APC and APNI names. +\item taxon_ID: the unique taxon concept identifier for the accepted_name; +only filled in if an APC accepted_name is available. +\item taxon_ID_genus: an identifier for the genus; +only filled in if an APC-accepted genus name is available. +\item scientific_name_ID: an identifier for the nomenclatural (not taxonomic) +details of a scientific name; available for both APC and APNI names. \item row_number: the row number of a specific original_name in the input. -\item number_of_collapsed_taxa: when taxonomic_splits == "collapse_to_higher_taxon", the number of possible taxon names that have been collapsed. +\item number_of_collapsed_taxa: when taxonomic_splits == "collapse_to_higher_taxon", +the number of possible taxon names that have been collapsed. 
} } \description{ -This function takes a list of Australian plant names that need to be reconciled with current taxonomy and -generates a lookup table of the best-possible scientific name match for each input name. -It uses first the function \code{align_taxa}, then the function \code{update_taxonomy} to achieve the output. +This function takes a list of Australian plant names that need to be +reconciled with current taxonomy and generates a lookup table of the +best-possible scientific name match for each input name. + +Usage case: This is APCalign’s core function, merging together the alignment +and updating of taxonomy. +} +\details{ +\itemize{ +\item It first uses the function \code{align_taxa}, then the function \code{update_taxonomy} +to achieve the output. The aligned name is the plant name that has been aligned +to a taxon name in the APC or APNI by the align_taxa function. +} + +Notes: +\itemize{ +\item If you will be running the function APCalign::create_taxonomic_update_lookup +many times, it is best to load the taxonomic resources separately using +\code{resources <- load_taxonomic_resources()}, then add the argument +resources = resources +\item The name Banksia cerrata does not align as the fuzzy matching algorithm +does not allow the first letter of the genus and species epithet to change. +\item The argument taxonomic_splits allows you to choose the outcome for updating +the names of taxa with ambiguous taxonomic histories; this applies to +scientific names that were once attached to a more broadly circumscribed +taxon concept, that was then split into several more narrowly circumscribed +taxon concepts, one of which retains the original name. There are three +options: most_likely_species returns the name that is retained, with +alternative names documented in square brackets; return_all adds additional +rows to the output, one for each possible taxon concept; +collapse_to_higher_taxon returns the genus with possible names in square +brackets.
+\item The argument identifier allows you to add a fixed text string to all genus- +and family- level names; for example, identifier = "Royal NP" would return +\verb{Acacia sp. \[Royal NP]}. +} } \examples{ -\donttest{resources <- load_taxonomic_resources() +\donttest{ +resources <- load_taxonomic_resources() + +# example 1 create_taxonomic_update_lookup(c("Eucalyptus regnans", "Acacia melanoxylon", "Banksia integrifolia", "Not a species"), - resources=resources) + resources = resources) + +# example 2 +input <- c("Banksia serrata", "Banksia serrate", "Banksia cerrata", +"Banksea serrata", "Banksia serrrrata", "Dryandra") + +create_taxonomic_update_lookup( + taxa = input, + identifier = "APCalign test", + full = TRUE, + resources = resources + ) + +# example 3 +taxon_list <- + readr::read_csv( + system.file("extdata", "test_taxa.csv", package = "APCalign"), + show_col_types = FALSE) + +create_taxonomic_update_lookup( + taxa = taxon_list$original_name, + identifier = taxon_list$notes, + full = TRUE, + resources = resources + ) } + } \seealso{ \code{\link{load_taxonomic_resources}} diff --git a/man/default_version.Rd b/man/default_version.Rd new file mode 100644 index 00000000..c0e7e100 --- /dev/null +++ b/man/default_version.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/load_taxonomic_resources.R +\name{default_version} +\alias{default_version} +\title{Get the default version for stable data} +\usage{ +default_version() +} +\value{ +A character string representing the default version for stable data. +} +\description{ +This function returns the default version for stable data, which is used when no +version is specified.
+} diff --git a/man/load_taxonomic_resources.Rd b/man/load_taxonomic_resources.Rd index cfab6cc3..d3d12eba 100644 --- a/man/load_taxonomic_resources.Rd +++ b/man/load_taxonomic_resources.Rd @@ -2,32 +2,45 @@ % Please edit documentation in R/load_taxonomic_resources.R \name{load_taxonomic_resources} \alias{load_taxonomic_resources} -\title{Load taxonomic resources from either stable or current versions of APC and APNI} +\title{Load taxonomic reference lists, APC & APNI} \usage{ load_taxonomic_resources( stable_or_current_data = "stable", version = default_version(), - reload = FALSE + quiet = FALSE ) } \arguments{ -\item{stable_or_current_data}{Type of dataset to access. The default is "stable", which loads the -dataset from a github archived file. If set to "current", the dataset will be loaded from -a URL which is the cutting edge version, but this may change at any time without notice.} +\item{stable_or_current_data}{Type of dataset to access. +The default is "stable", which loads the dataset from a github archived file. +If set to "current", the dataset will be loaded from a URL which is the +cutting edge version, but this may change at any time without notice.} -\item{version}{The version number of the dataset to use. Defaults to the default version.} +\item{version}{The version number of the dataset to use. +Defaults to the default version.} -\item{reload}{A logical indicating whether to reload the dataset from the data source. Defaults to FALSE.} +\item{quiet}{A logical indicating whether to print status of loading to screen. +Defaults to FALSE.} } \value{ The taxonomic resources data loaded into the global environment. } \description{ -This function loads two taxonomic datasets for Australia's vascular plants, the APC and APNI, into the global environment. -It accesses taxonomic data from a dataset using the provided version number or the default version. -The function creates several data frames by filtering and selecting data from the loaded lists. 
+This function loads two taxonomic datasets for Australia's vascular plants, +the APC and APNI, into the global environment. It creates several data frames +by filtering and selecting data from the loaded lists. +} +\details{ +\itemize{ +\item It accesses taxonomic data from a dataset using the provided version number +or the default version. +\item The output is several dataframes that include subsets of the APC/APNI based +on taxon rank and taxonomic status. +} } \examples{ -\donttest{load_taxonomic_resources(stable_or_current_data="stable",version="0.0.2.9000")} +\donttest{ +load_taxonomic_resources(stable_or_current_data="stable", +version="0.0.2.9000")} } diff --git a/man/native_anywhere_in_australia.Rd b/man/native_anywhere_in_australia.Rd index 2e9e6cd4..f4e63c25 100644 --- a/man/native_anywhere_in_australia.Rd +++ b/man/native_anywhere_in_australia.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/native_anywhere_in_australia.R \name{native_anywhere_in_australia} \alias{native_anywhere_in_australia} -\title{For a vector of taxon names in to the APC, check if the species are native anywhere in Australia} +\title{Native anywhere in Australia} \usage{ native_anywhere_in_australia(species, resources = load_taxonomic_resources()) } @@ -10,18 +10,30 @@ native_anywhere_in_australia(species, resources = load_taxonomic_resources()) \item{species}{A character string typically representing the binomial for the species.} \item{resources}{An optional list of taxonomic resources to use for the lookup. 
-If not provided, the function will load default taxonomic resources using the \code{load_taxonomic_resources()} function.} +If not provided, the function will load default taxonomic resources using the +\code{load_taxonomic_resources()} function.} } \value{ -A tibble with two columns: \code{species}, which is the same as the unique values of the input \code{species}, -and \code{native_anywhere_in_aus}, a vector indicating whether each species is native anywhere in Australia, introduced by humans from elsewhere, or unknown with respect to the APC resource. +A tibble with two columns: \code{species}, which is the same as the unique values of +the input \code{species}, and \code{native_anywhere_in_aus}, a vector indicating whether each +species is native anywhere in Australia, introduced by humans from elsewhere, or +unknown with respect to the APC resource. } \description{ -This function checks if the given species is native anywhere in Australia according to the APC. -Note that this will not detect within-Australia introductions, e.g. if a species is from Western Australia and is invasive on the east coast. -And recent invasions are unlikely to be documented yet in APC. -For the complete matrix of species by states that also represents within-Australia invasions, -use \link{create_species_state_origin_matrix}. For spelling checks and taxonomy updates please see \link{create_taxonomic_update_lookup}. +This function checks which species from a list are thought to be native anywhere in +Australia according to the APC. +} +\details{ +Important caveats: +\itemize{ +\item This function will not detect within-Australia introductions, +e.g. if a species is from Western Australia and is invasive on the east coast. +\item Very recent invasions are unlikely to be documented yet in APC. +\item Ideally check spelling and taxonomy updates first via +\link{create_taxonomic_update_lookup}.
+\item For the complete matrix of species by states that also represents +within-Australia invasions, use \link{create_species_state_origin_matrix}. +} } \examples{ \donttest{native_anywhere_in_australia(c("Eucalyptus globulus","Pinus radiata","Banksis notaspecies"))} diff --git a/man/reexports.Rd b/man/reexports.Rd new file mode 100644 index 00000000..22300d28 --- /dev/null +++ b/man/reexports.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/reexports.R +\docType{import} +\name{reexports} +\alias{reexports} +\alias{\%>\%} +\title{Objects exported from other packages} +\keyword{internal} +\description{ +These objects are imported from other packages. Follow the links +below to see their documentation. + +\describe{ + \item{dplyr}{\code{\link[dplyr:reexports]{\%>\%}}} +}} + diff --git a/man/standardise_names.Rd b/man/standardise_names.Rd index fc691262..7a9ad0b5 100644 --- a/man/standardise_names.Rd +++ b/man/standardise_names.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/standardise_names.R \name{standardise_names} \alias{standardise_names} -\title{Standardises taxon names by performing a series of text substitutions to remove common inconsistencies in taxonomic nomenclature.} +\title{Standardise taxon names} \usage{ standardise_names(taxon_names) } @@ -13,10 +13,22 @@ standardise_names(taxon_names) A character vector of standardised taxon names. } \description{ +Standardises taxon names by performing a series of text substitutions to +remove common inconsistencies in taxonomic nomenclature. + The function takes a character vector of taxon names as input and -returns a character vector of taxon names using standardised taxonomic syntax as output. -In particular it standardises taxon rank abbreviations and qualifiers (subsp., var., f.), as people use many variants of these terms. -It also standardises or removes a few additional filler words used within taxon names (affinis becomes aff.; s.l. and s.s. 
are removed). +returns a character vector of taxon names using standardised taxonomic syntax +as output. +} +\details{ +\itemize{ +\item It removes stray punctuation at the start and end of a character string. +\item It standardises unusual characters and symbols to ASCII equivalents. +\item It standardises taxon rank abbreviations and qualifiers (subsp., var., f.), +as people use many variants of these terms. +\item It standardises or removes a few additional filler words used within +taxon names (affinis becomes aff.; s.l. and s.s. are removed). +} } \examples{ standardise_names(c("Quercus suber", diff --git a/man/standardise_taxon_rank.Rd b/man/standardise_taxon_rank.Rd new file mode 100644 index 00000000..23af4949 --- /dev/null +++ b/man/standardise_taxon_rank.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/standardise_names.R +\name{standardise_taxon_rank} +\alias{standardise_taxon_rank} +\title{Standardise taxon ranks} +\usage{ +standardise_taxon_rank(taxon_rank) +} +\arguments{ +\item{taxon_rank}{A character vector of Latin taxon ranks.} +} +\value{ +A character vector of English taxon ranks. +} +\description{ +Standardise taxon ranks from Latin into English. +} +\details{ +The function takes a character vector of Latin taxon ranks as input and +returns a character vector of taxon ranks using standardised English terms. 
+} +\examples{ +standardise_taxon_rank(c("regnum", "kingdom", "classis", "class")) +} diff --git a/man/state_diversity_counts.Rd b/man/state_diversity_counts.Rd index 9f2e3f68..1d5f0332 100644 --- a/man/state_diversity_counts.Rd +++ b/man/state_diversity_counts.Rd @@ -2,22 +2,31 @@ % Please edit documentation in R/state_diversity_counts.R \name{state_diversity_counts} \alias{state_diversity_counts} -\title{For Australian states and territories, use data from the APC to calculate state-level diversity for native, introduced, and more complicated species origins} +\title{State- and territory-level diversity} \usage{ state_diversity_counts(state, resources = load_taxonomic_resources()) } \arguments{ -\item{state}{A character string indicating the Australian state or territory to calculate the diversity for. Possible values are "NSW", "NT", "Qld", "WA", "ChI", "SA", "Vic", "Tas", "ACT", "NI", "LHI", "MI", "HI", "MDI", "CoI", "CSI", and "AR".} +\item{state}{A character string indicating the Australian state or +territory to calculate the diversity for. Possible values are "NSW", "NT", +"Qld", "WA", "ChI", "SA", "Vic", "Tas", "ACT", "NI", "LHI", "MI", "HI", +"MDI", "CoI", "CSI", and "AR".} -\item{resources}{the taxonomic resources required to make the summary statistics. loading this can be slow, so call load_taxonomic_resources separately to greatly speed this function up and pass the resources in.} +\item{resources}{the taxonomic resources required to make the summary +statistics. loading this can be slow, so call load_taxonomic_resources +separately to greatly speed this function up and pass the resources in.} } \value{ -A tibble of diversity counts for the specified state or territory, including native, introduced, and more complicated species origins. -The tibble has three columns: "origin" indicating the origin of the species, "state" indicating the Australian state or territory, and "num_species" indicating the number of species for that origin and state. 
+A tibble of diversity counts for the specified state or territory, +including native, introduced, and more complicated species origins. +The tibble has three columns: "origin" indicating the origin of the +species, "state" indicating the Australian state or territory, and +"num_species" indicating the number of species for that origin and state. } \description{ -This function calculates state-level diversity for native, introduced, and more complicated species origins -based on the geographic data available in the APC. +For Australian states and territories, use geographic distribution data from +the APC to calculate state-level diversity for native, introduced, +and more complicated species origins } \examples{ \donttest{state_diversity_counts(state = "NSW")} diff --git a/man/strip_names.Rd b/man/strip_names.Rd index 459288c4..ea26df1e 100644 --- a/man/strip_names.Rd +++ b/man/strip_names.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/strip_names.R \name{strip_names} \alias{strip_names} -\title{Strip taxonomic names of taxon rank abbreviations and qualifiers and special characters} +\title{Strip taxon names} \usage{ strip_names(taxon_names) } @@ -10,13 +10,23 @@ strip_names(taxon_names) \item{taxon_names}{A character vector of taxonomic names to be stripped.} } \value{ -A character vector of stripped taxonomic names, with subtaxa designations, special -characters, and extra whitespace removed, and all letters converted to lowercase. +A character vector of stripped taxonomic names, +with subtaxa designations, special characters, and extra whitespace +removed, and all letters converted to lowercase. } \description{ -Given a vector of taxonomic names, this function removes subtaxa designations ("subsp.", "var.", "f.", and "ser"), -special characters (e.g., "-", ".", "(", ")", "?"), and extra whitespace. The resulting vector -of names is also converted to lowercase. 
+Strip taxonomic names of taxon rank abbreviations and qualifiers +and special characters +} +\details{ +Given a vector of taxonomic names, this function removes: +\itemize{ +\item subtaxa designations ("subsp.", "var.", "f.", and "ser") +\item special characters (e.g., "-", ".", "(", ")", "?") +\item extra whitespace +} + +The resulting vector of names is also converted to lowercase. } \examples{ strip_names(c("Abies lasiocarpa subsp. lasiocarpa", diff --git a/man/strip_names_2.Rd b/man/strip_names_2.Rd deleted file mode 100644 index 2812d9bd..00000000 --- a/man/strip_names_2.Rd +++ /dev/null @@ -1,29 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/strip_names.R -\name{strip_names_2} -\alias{strip_names_2} -\title{Strip taxonomic names of taxon rank abbreviations and qualifiers, filler words and special characters} -\usage{ -strip_names_2(taxon_names) -} -\arguments{ -\item{taxon_names}{A character vector of taxonomic names to be stripped.} -} -\value{ -A character vector of stripped taxonomic names, with subtaxa designations, special -characters, additional filler words and extra whitespace removed, and all letters converted to lowercase. -} -\description{ -Given a vector of taxonomic names, this function removes subtaxa designations ("subsp.", "var.", "f.", and "ser"), -additional filler words and characters (" x " for hybrid taxa, "sp."), -special characters (e.g., "-", ".", "(", ")", "?"), and extra whitespace. The resulting vector -of names is also converted to lowercase. -} -\examples{ -strip_names_2(c("Abies lasiocarpa subsp. lasiocarpa", - "Quercus kelloggii", - "Pinus contorta var. latifolia", - "Acacia sp.", - "Lepidium sp. 
Tanguin Hill (K.R.Newbey 10501)")) - -} diff --git a/man/strip_names_extra.Rd b/man/strip_names_extra.Rd new file mode 100644 index 00000000..ff26a3ca --- /dev/null +++ b/man/strip_names_extra.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/strip_names.R +\name{strip_names_extra} +\alias{strip_names_extra} +\title{Strip taxon names, extra} +\usage{ +strip_names_extra(taxon_names) +} +\arguments{ +\item{taxon_names}{A character vector of taxonomic names to be stripped.} +} +\value{ +A character vector of stripped taxonomic names, +with \code{sp.} and hybrid symbols removed. +} +\description{ +Strip taxonomic names of \code{sp.} and hybrid symbols. This function assumes +that a character vector has already been run through \code{strip_names}. +} +\details{ +Given a vector of taxonomic names, this function removes additional filler +words (" x " for hybrid taxa, "sp.") not removed by the function +\code{strip_names} +} +\examples{ +strip_names_extra(c("Abies lasiocarpa subsp. lasiocarpa", + "Quercus kelloggii", + "Pinus contorta var. latifolia", + "Acacia sp.", + "Lepidium sp. Tanguin Hill (K.R.Newbey 10501)")) + +} diff --git a/man/update_taxonomy.Rd b/man/update_taxonomy.Rd index 4a5a84ad..cf9804c6 100644 --- a/man/update_taxonomy.Rd +++ b/man/update_taxonomy.Rd @@ -2,73 +2,130 @@ % Please edit documentation in R/update_taxonomy.R \name{update_taxonomy} \alias{update_taxonomy} -\title{For a list of taxon names aligned to the APC, update the name to an accepted taxon concept per the APC and add scientific name and taxon concept metadata to names aligned to either the APC or APNI.} +\title{Update to currently accepted APC name and add APC/APNI name metadata} \usage{ update_taxonomy( aligned_data, taxonomic_splits = "most_likely_species", + quiet = TRUE, output = NULL, resources = load_taxonomic_resources() ) } \arguments{ -\item{aligned_data}{A tibble of plant names to update. 
This table must include 5 columns, original_name, aligned_name, taxon_rank, taxonomic_dataset, and aligned_reason. +\item{aligned_data}{A tibble of plant names to update. This table must +include 5 columns, original_name, aligned_name, taxon_rank, +taxonomic_dataset, and aligned_reason. These columns are created by the function \code{align_taxa}. -The columns \code{original_name} and \code{aligned_name} must be in the format of the scientific name, with genus and species, -and may contain additional qualifiers such as subspecies or varieties. The names are case insensitive.} +The columns \code{original_name} and \code{aligned_name} must be in the format of the +scientific name, with genus and species, +and may contain additional qualifiers such as subspecies or varieties. The +names are case insensitive.} -\item{taxonomic_splits}{Variable that determines what protocol to use to update taxon names that are ambiguous due to taxonomic splits. +\item{taxonomic_splits}{Variable that determines what protocol to use to +update taxon names that are ambiguous due to taxonomic splits. 
The three options are: -most_likely_species, which returns the species name in use before the split; alternative names are returned in a separate column -return_all, which returns all possible names -collapse_to_higher_taxon, which declares that an ambiguous name cannot be aligned to an accepted species/infraspecific name and the name is demoted to genus rank} +\itemize{ +\item \code{most_likely_species}, which returns the species name in use before the +split; alternative names are returned in a separate column +\item \code{return_all}, which returns all possible names +\item \code{collapse_to_higher_taxon}, which declares that an ambiguous name cannot +be aligned to an accepted species/infraspecific name and the name is +demoted to genus rank +}} + +\item{quiet}{Logical to indicate whether to display messages while updating +taxa.} -\item{output}{(optional) Name of the file where results are saved. The default is NULL and no file is created. -If specified, the output will be saved in a CSV file with the given name.} +\item{output}{(optional) Name of the file where results are saved. The +default is NULL and no file is created. If specified, the output will be +saved in a CSV file with the given name.} -\item{resources}{the taxonomic resources required to make the summary statistics. Loading this can be slow, so call load_taxonomic_resources separately to greatly speed this function up and pass the resources in.} +\item{resources}{the taxonomic resources required to make the summary +statistics. Loading this can be slow, so call load_taxonomic_resources +separately to greatly speed this function up and pass the resources in.} } \value{ -A tibble with updated taxonomy for the specified plant names. The tibble contains the following columns: +A tibble with updated taxonomy for the specified plant names. The +tibble contains the following columns: \itemize{ \item original_name: the original plant name. 
-\item aligned_name: the input plant name that has been aligned to a taxon name in the APC or APNI by the align_taxa function. +\item aligned_name: the input plant name that has been aligned to a taxon name +in the APC or APNI by the align_taxa function. \item accepted_name: the APC-accepted plant name, when available. -\item suggested_name: the suggested plant name to use. Identical to the accepted_name, when an accepted_name exists; otherwise the the suggested_name is the aligned_name. -\item genus: the genus of the accepted (or suggested) name; only APC-accepted genus names are filled in. -\item family: the family of the accepted (or suggested) name; only APC-accepted family names are filled in. +\item suggested_name: the suggested plant name to use. Identical to the +accepted_name, when an accepted_name exists; otherwise the suggested_name +is the aligned_name. +\item genus: the genus of the accepted (or suggested) name; only APC-accepted +genus names are filled in. +\item family: the family of the accepted (or suggested) name; only APC-accepted +family names are filled in. \item taxon_rank: the taxonomic rank of the suggested (and accepted) name. -\item taxonomic_dataset: the source of the suggested (and accepted) names (APC or APNI). +\item taxonomic_dataset: the source of the suggested (and accepted) names (APC or +APNI). \item taxonomic_status: the taxonomic status of the suggested (and accepted) name. -\item taxonomic_status_aligned: the taxonomic status of the aligned name, before any taxonomic updates have been applied. -\item aligned_reason: the explanation of a specific taxon name alignment (from an original name to an aligned name). -\item update_reason: the explanation of a specific taxon name update (from an aligned name to an accepted or suggested name). +\item taxonomic_status_aligned: the taxonomic status of the aligned name, before +any taxonomic updates have been applied. 
+\item aligned_reason: the explanation of a specific taxon name alignment (from an +original name to an aligned name). +\item update_reason: the explanation of a specific taxon name update (from an +aligned name to an accepted or suggested name). \item subclass: the subclass of the accepted name. -\item taxon_distribution: the distribution of the accepted name; only filled in if an APC accepted_name is available. -\item scientific_name_authorship: the authorship information for the accepted (or synonymous) name; available for both APC and APNI names. -\item taxon_ID: the unique taxon concept identifier for the accepted_name; only filled in if an APC accepted_name is available. -\item taxon_ID_genus: an identifier for the genus; only filled in if an APC-accepted genus name is available. -\item scientific_name_ID: an identifier for the nomenclatural (not taxonomic) details of a scientific name; available for both APC and APNI names. +\item taxon_distribution: the distribution of the accepted name; only filled in +if an APC accepted_name is available. +\item scientific_name_authorship: the authorship information for the accepted +(or synonymous) name; available for both APC and APNI names. +\item taxon_ID: the unique taxon concept identifier for the accepted_name; only +filled in if an APC accepted_name is available. +\item taxon_ID_genus: an identifier for the genus; only filled in if an +APC-accepted genus name is available. +\item scientific_name_ID: an identifier for the nomenclatural (not taxonomic) +details of a scientific name; available for both APC and APNI names. \item row_number: the row number of a specific original_name in the input. -\item number_of_collapsed_taxa: when taxonomic_splits == "collapse_to_higher_taxon", the number of possible taxon names that have been collapsed. +\item number_of_collapsed_taxa: when taxonomic_splits == "collapse_to_higher_taxon", +the number of possible taxon names that have been collapsed. 
} } \description{ -This function uses the APC to update the taxonomy of names aligned to a taxon concept listed in the APC to the currently accepted name for the taxon concept. -The aligned_data data frame that is input must contain 5 columns, -\code{original_name}, \code{aligned_name}, \code{taxon_rank}, \code{taxonomic_dataset}, and \code{aligned_reason}. -The aligned name is a plant name that has been aligned to a taxon name in the APC or APNI by the align_taxa function. +For a list of taxon names aligned to the APC, update the name to an accepted +taxon concept per the APC and add scientific name and taxon concept metadata +to names aligned to either the APC or APNI. +} +\details{ +\itemize{ +\item This function uses the APC to update the taxonomy of names aligned to a +taxon concept listed in the APC to the currently accepted name for the taxon +concept. +\item The aligned_data data frame that is input must contain 5 columns, +\code{original_name}, \code{aligned_name}, \code{taxon_rank}, \code{taxonomic_dataset}, and +\code{aligned_reason}. (These are the columns output by the function \code{align_taxa}.) +\item The aligned name is a plant name that has been aligned to a taxon name in +the APC or APNI by the align_taxa function. +} + +Notes: +\itemize{ +\item As the input for this function is a table with 5 columns (output by +align_taxa), this function will only be used when you explicitly want to +separate the alignment and updating components of APCalign. This function is +the second half of create_taxonomic_update_lookup. 
+} } \examples{ # Update taxonomy for two plant names and print the result -\donttest{update_taxonomy( - tibble::tibble( +\donttest{ +resources <- load_taxonomic_resources() + +update_taxonomy( + dplyr::tibble( original_name = c("Dryandra preissii", "Banksia acuminata"), aligned_name = c("Dryandra preissii", "Banksia acuminata"), taxon_rank = c("species", "species"), taxonomic_dataset = c("APC", "APC"), - aligned_reason = NA_character_ - ) + aligned_reason = c(NA_character_, + NA_character_) + ), + resources = resources ) } } diff --git a/tests/testthat/benchmarks/standardise_names.csv b/tests/testthat/benchmarks/standardise_names.csv new file mode 100644 index 00000000..df28e2c0 --- /dev/null +++ b/tests/testthat/benchmarks/standardise_names.csv @@ -0,0 +1,43 @@ +taxon_names,standardised_names,genus,stripped_names,stripped_names_extra +(Dockrillia pugioniformis x Dockrillia striolata) x Dockrillia pugioniformis,Dockrillia pugioniformis x Dockrillia striolata) x Dockrillia pugioniformis,Dockrillia,dockrillia pugioniformis x dockrillia striolata x dockrillia pugioniformis,dockrillia pugioniformis dockrillia striolata dockrillia pugioniformis +Mesua sp. Boonjee,Mesua sp. 
Boonjee,Mesua,mesua sp boonjee,mesua boonjee +x Cynochloris macivorii,X Cynochloris macivorii,x Cynochloris,x cynochloris macivorii,x cynochloris macivorii +X Cynochloris macivorii,X Cynochloris macivorii,x Cynochloris,x cynochloris macivorii,x cynochloris macivorii +× Cynochloris macivorii,X Cynochloris macivorii,x Cynochloris,x cynochloris macivorii,x cynochloris macivorii +Xanthorrhoea macronema*,Xanthorrhoea macronema,Xanthorrhoea,xanthorrhoea macronema,xanthorrhoea macronema +CALYTRIX ALPESTRIS,CALYTRIX ALPESTRIS,Calytrix,calytrix alpestris,calytrix alpestris +calytrix ALPESTRIS,Calytrix ALPESTRIS,Calytrix,calytrix alpestris,calytrix alpestris +Centaurea × moncktonii,Centaurea x moncktonii,Centaurea,centaurea x moncktonii,centaurea moncktonii +Centaurea x moncktonii,Centaurea x moncktonii,Centaurea,centaurea x moncktonii,centaurea moncktonii +Thelymitra X irregularis,Thelymitra X irregularis,Thelymitra,thelymitra x irregularis,thelymitra irregularis +thelymitra X irregularis,Thelymitra X irregularis,Thelymitra,thelymitra x irregularis,thelymitra irregularis +Viola hederacea sensu Willis (1972),Viola hederacea sensu Willis (1972),Viola,viola hederacea sensu willis 1972,viola hederacea sensu willis 1972 +Cryptandra/Mirbelia sp.,Cryptandra / Mirbelia sp.,Cryptandra,cryptandra mirbelia sp,cryptandra mirbelia sp +Cryptandra∕Mirbelia sp.,Cryptandra / Mirbelia sp.,Cryptandra,cryptandra mirbelia sp,cryptandra mirbelia sp +?Xanthorrhoea macronema,Xanthorrhoea macronema,Xanthorrhoea,xanthorrhoea macronema,xanthorrhoea macronema +Pinus contorta var. latifolia,Pinus contorta var. latifolia,Pinus,pinus contorta latifolia,pinus contorta latifolia +Pinus contorta v latifolia,Pinus contorta var. latifolia,Pinus,pinus contorta latifolia,pinus contorta latifolia +Macrozamia preissii affinis dyeri,Macrozamia preissii aff. 
dyeri,Macrozamia,macrozamia preissii aff dyeri,macrozamia preissii aff dyeri +Macrozamia preissii affinis,Macrozamia preissii affinis,Macrozamia,macrozamia preissii affinis,macrozamia preissii affinis +Macrozamia preissii aff dyeri,Macrozamia preissii aff. dyeri,Macrozamia,macrozamia preissii aff dyeri,macrozamia preissii aff dyeri +Macrozamia preissii aff,Macrozamia preissii aff.,Macrozamia,macrozamia preissii aff,macrozamia preissii aff +Macrozamia preissii affin dyeri,Macrozamia preissii aff. dyeri,Macrozamia,macrozamia preissii aff dyeri,macrozamia preissii aff dyeri +Macrozamia preissii affin,Macrozamia preissii aff.,Macrozamia,macrozamia preissii aff,macrozamia preissii aff +Macrozamia preissii subsp. dyeri,Macrozamia preissii subsp. dyeri,Macrozamia,macrozamia preissii dyeri,macrozamia preissii dyeri +Macrozamia preissii ssp. dyeri,Macrozamia preissii subsp. dyeri,Macrozamia,macrozamia preissii dyeri,macrozamia preissii dyeri +Macrozamia preissii ssp dyeri,Macrozamia preissii subsp. dyeri,Macrozamia,macrozamia preissii dyeri,macrozamia preissii dyeri +Macrozamia preissii ss,Macrozamia preissii,Macrozamia,macrozamia preissii,macrozamia preissii +Macrozamia preissii sl,Macrozamia preissii,Macrozamia,macrozamia preissii,macrozamia preissii +Macrozamia preissii sensu stricto,Macrozamia preissii,Macrozamia,macrozamia preissii,macrozamia preissii +Macrozamia preissii sensu lato,Macrozamia preissii,Macrozamia,macrozamia preissii,macrozamia preissii +Macrozamia preissii ssensu lato,Macrozamia preissii ssensu lato,Macrozamia,macrozamia preissii ssensu lato,macrozamia preissii ssensu lato +Psychotria daphnoides f. 'small-leaved',Psychotria daphnoides f. 'small-leaved',Psychotria,psychotria daphnoides small leaved,psychotria daphnoides small leaved +Psychotria daphnoides forma 'small-leaved',Psychotria daphnoides f. 
'small-leaved',Psychotria,psychotria daphnoides small leaved,psychotria daphnoides small leaved +Psychotria daphnoides form 'small-leaved',Psychotria daphnoides f. 'small-leaved',Psychotria,psychotria daphnoides small leaved,psychotria daphnoides small leaved +Psydrax odorata f. buxifolia,Psydrax odorata f. buxifolia,Psydrax,psydrax odorata buxifolia,psydrax odorata buxifolia +Billardiera ser. Parviflorae,Billardiera ser. Parviflorae,Billardiera,billardiera parviflorae,billardiera parviflorae +Billardiera series Parviflorae,Billardiera ser. Parviflorae,Billardiera,billardiera parviflorae,billardiera parviflorae +Hydrocotyle hirta var. pedicellosa,Hydrocotyle hirta var. pedicellosa,Hydrocotyle,hydrocotyle hirta pedicellosa,hydrocotyle hirta pedicellosa +Pterocaulon ciliosum x Pterocaulon serrulatum var. serrulatum,Pterocaulon ciliosum x Pterocaulon serrulatum var. serrulatum,Pterocaulon,pterocaulon ciliosum x pterocaulon serrulatum serrulatum,pterocaulon ciliosum pterocaulon serrulatum serrulatum +Tecticornia sp. Little Sandy Desert (K.A.Shepherd & C.Wilkins KS 830),Tecticornia sp. Little Sandy Desert (K.A.Shepherd & C.Wilkins KS 830),Tecticornia,tecticornia sp little sandy desert kashepherd cwilkins ks 830,tecticornia little sandy desert kashepherd cwilkins ks 830 +Pterostylis sp. Bloated snail orchid (W.Jackson BJ486),Pterostylis sp. Bloated snail orchid (W.Jackson BJ486),Pterostylis,pterostylis sp bloated snail orchid wjackson bj486,pterostylis bloated snail orchid wjackson bj486 diff --git a/tests/testthat/benchmarks/test_matches_alignments_updates.csv b/tests/testthat/benchmarks/test_matches_alignments_updates.csv index bf158aaa..6a42058b 100644 --- a/tests/testthat/benchmarks/test_matches_alignments_updates.csv +++ b/tests/testthat/benchmarks/test_matches_alignments_updates.csv @@ -31,28 +31,28 @@ Aporuelliaa abc--def,match_03c,match_03c,Aporuellia sp. [Aporuelliaa abc--def; t Driandra abc--def,match_03c,match_03c,Dryandra sp. 
[Driandra abc--def; test_all_matches_TRUE],APC,genus,Banksia,FALSE,https://id.biodiversity.org.au/instance/apni/865048,https://id.biodiversity.org.au/name/apni/77744,Dryandra R.Br. Xyystidium abc--def,match_03d,match_03d,Xystidium sp. [Xyystidium abc--def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin. Zygiaa abc--def,match_03d,match_03d,Zygia sp. [Zygiaa abc--def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne -Abcde fgh -- ijk,match_03e,match_03e,NA,NA,genus,NA,TRUE,NA,NA,NA -Ryandra abc--def,match_03e,match_03e,NA,NA,genus,NA,TRUE,NA,NA,NA +Abcde fgh -- ijk,NA,NA,NA,NA,NA,NA,TRUE,NA,NA,NA +Ryandra abc--def,match_03d,match_03d,Randia sp. [Ryandra abc--def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA Abildgaardia odontocarpa / Abildgaardia oxystachya,match_04a,match_04a,Abildgaardia sp. [Abildgaardia odontocarpa / Abildgaardia oxystachya; test_all_matches_TRUE],APC,genus,Abildgaardia,FALSE,https://id.biodiversity.org.au/node/apni/2905759,https://id.biodiversity.org.au/name/apni/55984,Abildgaardia Vahl Acanthocarpus fimbriatus / Acanthocarpus mucronatus,match_04a,match_04a,Acanthocarpus sp. [Acanthocarpus fimbriatus / Acanthocarpus mucronatus; test_all_matches_TRUE],APC,genus,Acanthocarpus,FALSE,https://id.biodiversity.org.au/node/apni/2899190,https://id.biodiversity.org.au/name/apni/72610,Acanthocarpus Lehm. Acanthocarpus fimbriatus / mucronatus,match_04a,match_04a,Acanthocarpus sp. [Acanthocarpus fimbriatus / mucronatus; test_all_matches_TRUE],APC,genus,Acanthocarpus,FALSE,https://id.biodiversity.org.au/node/apni/2899190,https://id.biodiversity.org.au/name/apni/72610,Acanthocarpus Lehm. Banksia serrata / Banksia ericifolia,match_04a,match_04a,Banksia sp. 
[Banksia serrata / Banksia ericifolia; test_all_matches_TRUE],APC,genus,Banksia,FALSE,https://id.biodiversity.org.au/taxon/apni/51445230,https://id.biodiversity.org.au/name/apni/105919,Banksia L.f. Banksia serrata / ericifolia,match_04a,match_04a,Banksia sp. [Banksia serrata / ericifolia; test_all_matches_TRUE],APC,genus,Banksia,FALSE,https://id.biodiversity.org.au/taxon/apni/51445230,https://id.biodiversity.org.au/name/apni/105919,Banksia L.f. -Banksia serrata/Banksia ericifolia,match_04a,match_04a,Banksia sp. [Banksia serrata/Banksia ericifolia; test_all_matches_TRUE],APC,genus,Banksia,FALSE,https://id.biodiversity.org.au/taxon/apni/51445230,https://id.biodiversity.org.au/name/apni/105919,Banksia L.f. +Banksia serrata/Banksia ericifolia,match_04a,match_04a,Banksia sp. [Banksia serrata / Banksia ericifolia; test_all_matches_TRUE],APC,genus,Banksia,FALSE,https://id.biodiversity.org.au/taxon/apni/51445230,https://id.biodiversity.org.au/name/apni/105919,Banksia L.f. Aporuellia abc / def,match_04a,match_04a,Aporuellia sp. [Aporuellia abc / def; test_all_matches_TRUE],APC,genus,Brunoniella,FALSE,https://id.biodiversity.org.au/instance/apni/903944,https://id.biodiversity.org.au/name/apni/97735,Aporuellia C.B.Clarke Dryandra abc / def,match_04a,match_04a,Dryandra sp. [Dryandra abc / def; test_all_matches_TRUE],APC,genus,Banksia,FALSE,https://id.biodiversity.org.au/instance/apni/865048,https://id.biodiversity.org.au/name/apni/77744,Dryandra R.Br. -Xystidium abc/def,match_04a,match_04a,Xystidium sp. [Xystidium abc/def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin. -Zygia abc/def,match_04a,match_04a,Zygia sp. [Zygia abc/def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne +Xystidium abc/def,match_04a,match_04a,Xystidium sp. 
[Xystidium abc / def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin. +Zygia abc/def,match_04a,match_04a,Zygia sp. [Zygia abc / def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne Abildgardiia odontocarpa / oxystachya,match_04b,match_04b,Abildgaardia sp. [Abildgardiia odontocarpa / oxystachya; test_all_matches_TRUE],APC,genus,Abildgaardia,FALSE,https://id.biodiversity.org.au/node/apni/2905759,https://id.biodiversity.org.au/name/apni/55984,Abildgaardia Vahl Accanthocarpis fimbriatus / Acanthocarpus mucronatus,match_04b,match_04b,Acanthocarpus sp. [Accanthocarpis fimbriatus / Acanthocarpus mucronatus; test_all_matches_TRUE],APC,genus,Acanthocarpus,FALSE,https://id.biodiversity.org.au/node/apni/2899190,https://id.biodiversity.org.au/name/apni/72610,Acanthocarpus Lehm. Bankseea serrata / ericifolia,match_04b,match_04b,Banksia sp. [Bankseea serrata / ericifolia; test_all_matches_TRUE],APC,genus,Banksia,FALSE,https://id.biodiversity.org.au/taxon/apni/51445230,https://id.biodiversity.org.au/name/apni/105919,Banksia L.f. Bannksia serrata / Banksia ericifolia,match_04b,match_04b,Banksia sp. [Bannksia serrata / Banksia ericifolia; test_all_matches_TRUE],APC,genus,Banksia,FALSE,https://id.biodiversity.org.au/taxon/apni/51445230,https://id.biodiversity.org.au/name/apni/105919,Banksia L.f. Aporuelliaa abc / def,match_04c,match_04c,Aporuellia sp. [Aporuelliaa abc / def; test_all_matches_TRUE],APC,genus,Brunoniella,FALSE,https://id.biodiversity.org.au/instance/apni/903944,https://id.biodiversity.org.au/name/apni/97735,Aporuellia C.B.Clarke Drrandra abc / def,match_04c,match_04c,Dryandra sp. [Drrandra abc / def; test_all_matches_TRUE],APC,genus,Banksia,FALSE,https://id.biodiversity.org.au/instance/apni/865048,https://id.biodiversity.org.au/name/apni/77744,Dryandra R.Br. -Xyystidium abc/def,match_04d,match_04d,Xystidium sp. 
[Xyystidium abc/def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin. +Xyystidium abc/def,match_04d,match_04d,Xystidium sp. [Xyystidium abc / def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin. Zygiaa abc / def,match_04d,match_04d,Zygia sp. [Zygiaa abc / def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne -Abcde fgh / ijk,match_04e,match_04e,NA,NA,genus,NA,TRUE,NA,NA,NA -Ryandra abc / def,match_04e,match_04e,NA,NA,genus,NA,TRUE,NA,NA,NA +Abcde fgh / ijk,NA,NA,NA,NA,NA,NA,TRUE,NA,NA,NA +Ryandra abc / def,match_04d,match_04d,Randia sp. [Ryandra abc / def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA Cycas candida K.D.Hill,match_05a,match_01a,Cycas candida,APC,species,Cycas candida,TRUE,https://id.biodiversity.org.au/node/apni/2893335,https://id.biodiversity.org.au/name/apni/188177,Cycas candida K.D.Hill Eremophila papillata Chinnock,match_05a,match_01a,Eremophila papillata,APC,species,Eremophila papillata,TRUE,https://id.biodiversity.org.au/node/apni/2910890,https://id.biodiversity.org.au/name/apni/207453,Eremophila papillata Chinnock Acalypha indica var. australis F.M.Bailey,match_05b,match_01b,Acalypha indica var. australis,APC,variety,Acalypha lanceolata,TRUE,https://id.biodiversity.org.au/instance/apni/889946,https://id.biodiversity.org.au/name/apni/72588,Acalypha indica var. australis F.M.Bailey @@ -125,8 +125,8 @@ Aporuelliaa aff def,match_09c,match_06c,Aporuellia sp. [Aporuelliaa aff. def; te Drrandra affinis def,match_09c,match_06c,Dryandra sp. [Drrandra aff. def; test_all_matches_TRUE],APC,genus,Banksia,FALSE,https://id.biodiversity.org.au/instance/apni/865048,https://id.biodiversity.org.au/name/apni/77744,Dryandra R.Br. Xyystidium aff. abc,match_09d,match_06d,Xystidium sp. [Xyystidium aff. 
abc; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin. Zygiaa aff. abc,match_09d,match_06d,Zygia sp. [Zygiaa aff. abc; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne -Abcde affinis fgh,match_09e,match_06e,NA,NA,genus,NA,TRUE,NA,NA,NA -Rryandra aff def,match_09e,match_06e,NA,NA,genus,NA,TRUE,NA,NA,NA +Abcde affinis fgh,NA,NA,NA,NA,NA,NA,TRUE,NA,NA,NA +Rryandra aff def,NA,NA,NA,NA,NA,NA,TRUE,NA,NA,NA Aceeena x ovinaaa,match_10a,match_07a,Acaena x ovina,APC,species,Acaena x ovina,FALSE,https://id.biodiversity.org.au/taxon/apni/51446291,https://id.biodiversity.org.au/name/apni/72209,Acaena x ovina A.Cunn. Banksiia serrratte,match_10a,match_07a,Banksia serrata,APC,species,Banksia serrata,TRUE,https://id.biodiversity.org.au/taxon/apni/51293610,https://id.biodiversity.org.au/name/apni/109014,Banksia serrata L.f. Eremoophila opppositifolia ssp. rubraaa,match_10a,match_07a,Eremophila oppositifolia subsp. rubra,APC,subspecies,Eremophila oppositifolia subsp. rubra,TRUE,https://id.biodiversity.org.au/node/apni/7951458,https://id.biodiversity.org.au/name/apni/117903,Eremophila oppositifolia subsp. rubra (C.T.White & W.D.Francis) Chinnock @@ -154,8 +154,8 @@ Aporuelliaa abc x def,match_11c,match_08c,Aporuellia x [Aporuelliaa abc x def; t Drrandra x def,match_11c,match_08c,Dryandra x [Drrandra x def; test_all_matches_TRUE],APC,genus,Banksia,FALSE,https://id.biodiversity.org.au/instance/apni/865048,https://id.biodiversity.org.au/name/apni/77744,Dryandra R.Br. Xyystidium x def,match_11d,match_08d,Xystidium x [Xyystidium x def; test_all_matches_TRUE],APNI,genus,Xystidium,FALSE,NA,https://id.biodiversity.org.au/name/apni/244613,Xystidium Trin. 
Zygiaa abc x Zygia def,match_11d,match_08d,Zygia x [Zygiaa abc x Zygia def; test_all_matches_TRUE],APNI,genus,Zygia,FALSE,NA,https://id.biodiversity.org.au/name/apni/65077,Zygia P.Browne -Abcde fgh x ijk,match_11e,match_08e,NA,NA,genus,NA,TRUE,NA,NA,NA -Ryandra abc x def,match_11e,match_08e,NA,NA,genus,NA,TRUE,NA,NA,NA +Abcde fgh x ijk,NA,NA,NA,NA,NA,NA,TRUE,NA,NA,NA +Ryandra abc x def,match_11d,match_11d,Randia x [Ryandra abc x def; test_all_matches_TRUE],APC,genus,Randia,FALSE,NA,NA,NA Baeckea sp. murchison river,match_12a,match_09a,Baeckea sp. Murchison River (M.E.Trudgen 12009),APC,species,Baeckea sp. Murchison River (M.E.Trudgen 12009),TRUE,https://id.biodiversity.org.au/node/apni/2888052,https://id.biodiversity.org.au/name/apni/191267,Baeckea sp. Murchison River (M.E.Trudgen 12009) WA Herbarium Eremophila oppositifolia rubra (needle leaves),match_12a,match_09a,Eremophila oppositifolia subsp. rubra,APC,subspecies,Eremophila oppositifolia subsp. rubra,TRUE,https://id.biodiversity.org.au/node/apni/7951458,https://id.biodiversity.org.au/name/apni/117903,Eremophila oppositifolia subsp. rubra (C.T.White & W.D.Francis) Chinnock Eremophila oppositifolia rubra early collection,match_12a,match_09a,Eremophila oppositifolia subsp. rubra,APC,subspecies,Eremophila oppositifolia subsp. rubra,TRUE,https://id.biodiversity.org.au/node/apni/7951458,https://id.biodiversity.org.au/name/apni/117903,Eremophila oppositifolia subsp. rubra (C.T.White & W.D.Francis) Chinnock @@ -222,3 +222,6 @@ Drryandra,match_22b,match_12f,Dryandra sp. [Drryandra; test_all_matches_TRUE],AP Dryandraa,match_22b,match_12f,Dryandra sp. [Dryandraa; test_all_matches_TRUE],APC,genus,Banksia,FALSE,https://id.biodiversity.org.au/instance/apni/865048,https://id.biodiversity.org.au/name/apni/77744,Dryandra R.Br. Actiniladum sp.,NA,NA,NA,NA,NA,NA,TRUE,NA,NA,NA Ecalypha indica australis,NA,NA,NA,NA,NA,NA,TRUE,NA,NA,NA +Asteracee sp.,,,Asteraceae sp. 
[Asteracee sp.; test_all_matches_TRUE],APC,family,Asteraceae,FALSE,https://id.biodiversity.org.au/taxon/apni/51695393,https://id.biodiversity.org.au/name/apni/54580,Asteraceae Bercht. & J.Presl +Compositeae sp.,,,Compositae sp. [Compositeae sp.; test_all_matches_TRUE],APC,family,Asteraceae,FALSE,https://id.biodiversity.org.au/taxon/apni/51695393,https://id.biodiversity.org.au/name/apni/54580,Asteraceae Bercht. & J.Presl +Compositae sp.,,,Compositae sp. [Compositae sp.; test_all_matches_TRUE],APC,family,Asteraceae,FALSE,https://id.biodiversity.org.au/taxon/apni/51695393,https://id.biodiversity.org.au/name/apni/54580,Asteraceae Bercht. & J.Presl diff --git a/tests/testthat/test-connection.R b/tests/testthat/test-connection.R index 834bcf0c..1e630c79 100644 --- a/tests/testthat/test-connection.R +++ b/tests/testthat/test-connection.R @@ -1,4 +1,6 @@ test_that("Complains when network is down", { + skip_if_offline(host = "api.github.com") + Sys.setenv("NETWORK_UP" = FALSE) expect_message(default_version()) expect_message(dataset_access_function()) diff --git a/tests/testthat/test-functions-standardise_names.R b/tests/testthat/test-functions-standardise_names.R new file mode 100644 index 00000000..a8e9fc5b --- /dev/null +++ b/tests/testthat/test-functions-standardise_names.R @@ -0,0 +1,43 @@ +test_that("Extract genus", { + + taxa <- + c( + NA, + "Banksia integrifolia", + "Acacia longifolia", + "Commersonia rosea", + "Thelymitra pauciflora", + "Justicia procumbens", + "Hibbertia", + "Rostellularia long leaves", + "Hibbertia sericea var silliafolius", + "Hibbertia sp.", + "x Cynochloris macivorii", + "(Dockrillia pugioniformis x Dockrillia striolata) x Dockrillia pugioniformis" + ) + + expected <- c(NA, "Banksia", "Acacia", "Commersonia", "Thelymitra", + "Justicia", "Hibbertia", "Rostellularia", "Hibbertia", + "Hibbertia", "x Cynochloris", "Dockrillia") + out <- extract_genus(taxa) + expect_equal(out, expected) +}) + +test_that("Standardise names names", { + + expected <- 
+ readr::read_csv("benchmarks/standardise_names.csv", show_col_types = FALSE) + + out <- + dplyr::tibble(taxon_names = expected$taxon_names, + standardised_names = standardise_names(taxon_names), + genus = extract_genus(standardised_names), + stripped_names = strip_names(standardised_names), + stripped_names_extra = strip_names_extra(stripped_names), + ) + #out %>% readr::write_csv("benchmarks/standardise_names.csv") + for(v in names(out)){ + expect_equal(out[[v]], expected[[v]], info=v) + } + +}) diff --git a/tests/testthat/test-functions-word.R b/tests/testthat/test-functions-word.R new file mode 100644 index 00000000..89e28213 --- /dev/null +++ b/tests/testthat/test-functions-word.R @@ -0,0 +1,24 @@ +test_that("Word", { + + taxa <- + c( + NA, + "Banksia integrifolia", + "Acacia longifolia", + "Commersonia rosea", + "Thelymitra pauciflora", + "Justicia procumbens", + "Hibbertia", + "Rostellularia long leaves", + "Hibbertia sericea var silliafolius", + "Hibbertia sp.", + "x Cynochloris macivorii", + "(Dockrillia pugioniformis x Dockrillia striolata) x Dockrillia pugioniformis" + ) + + expect_equal(APCalign:::word(taxa, 1), stringr::word(taxa, 1)) + expect_equal(APCalign:::word(taxa, 2), stringr::word(taxa, 2)) + expect_equal(APCalign:::word(taxa, 3), stringr::word(taxa, 3)) + expect_equal(APCalign:::word(taxa, 1,2), stringr::word(taxa, 1,2)) + expect_equal(APCalign:::word(taxa, 1,3), stringr::word(taxa, 1,3)) +}) diff --git a/tests/testthat/test-alignment_executes.R b/tests/testthat/test-operation_executes.R similarity index 79% rename from tests/testthat/test-alignment_executes.R rename to tests/testthat/test-operation_executes.R index f204b515..fdace06f 100644 --- a/tests/testthat/test-alignment_executes.R +++ b/tests/testthat/test-operation_executes.R @@ -28,7 +28,8 @@ test_that("create_taxonomic_update_lookup() returns more/less rows as requested" create_taxonomic_update_lookup( original_name, resources = resources, - taxonomic_splits = "most_likely_species" + 
taxonomic_splits = "most_likely_species", + quiet = TRUE ) expect_equal(out1$original_name, original_name) @@ -37,17 +38,19 @@ test_that("create_taxonomic_update_lookup() returns more/less rows as requested" create_taxonomic_update_lookup( original_name, resources = resources, - taxonomic_splits = "return_all" + taxonomic_splits = "return_all", + quiet = TRUE ) - # order and number of unqiue strings same as input + # order and number of unique strings same as input expect_equal(unique(out2$original_name), original_name) out3 <- create_taxonomic_update_lookup( original_name, resources = resources, - taxonomic_splits = "collapse_to_higher_taxon" + taxonomic_splits = "collapse_to_higher_taxon", + quiet = TRUE ) %>% dplyr::mutate( should_collapse = ifelse(number_of_collapsed_taxa > 1, TRUE, FALSE), @@ -66,9 +69,11 @@ test_that("align_taxa() executes - no/with fuzzy", { aligned_no_fuzzy <- c("Dryandra preissii", "Banksia acuminata", NA) out1 <- - align_taxa(original_name, resources = resources, fuzzy_matches = TRUE) + align_taxa(original_name, resources = resources, fuzzy_matches = TRUE, + quiet = TRUE) out2 <- - align_taxa(original_name, resources = resources, fuzzy_matches = FALSE) + align_taxa(original_name, resources = resources, fuzzy_matches = FALSE, + quiet = TRUE) expect_equal(original_name, out1$original_name) expect_equal(aligned_name, out1$aligned_name) @@ -77,12 +82,30 @@ test_that("align_taxa() executes - no/with fuzzy", { }) +test_that("quiet can be turned on and off", { + + original_name <- c("Dryandra preissii", "Banksia acuminata", "Bannksia accuminata") + + expect_silent( + out1 <- + align_taxa(original_name, resources = resources, fuzzy_matches = TRUE, + quiet = TRUE) + ) + + out1 <- + capture_messages(align_taxa(original_name, resources = resources, fuzzy_matches = TRUE, + quiet = FALSE)) + expect_true(length(out1) > 1) + +}) + test_that("align_taxa() executes with longer list", { species_list <- readr::read_csv(system.file("extdata", "species.csv", 
package = "APCalign"), show_col_types = FALSE) %>% dplyr::slice(1:50) - aligned_data <- align_taxa(species_list$name, resources = resources) + aligned_data <- align_taxa(species_list$name, resources = resources, + quiet = TRUE) expect_equal(nrow(aligned_data), 50) expect_equal(species_list$name, aligned_data$original_name) @@ -93,7 +116,7 @@ test_that("update_taxonomy() runs and prdouces suitable structure", { original_name <- c("Dryandra preissii", "Banksia acuminata") aligned_data <- - align_taxa(original_name, resources = resources) + align_taxa(original_name, resources = resources, quiet = TRUE) out1 <- update_taxonomy( @@ -108,7 +131,8 @@ test_that("update_taxonomy() runs and prdouces suitable structure", { out2 <- create_taxonomic_update_lookup( aligned_data$original_name, resources = resources, - taxonomic_splits = "most_likely_species" + taxonomic_splits = "most_likely_species", + quiet = TRUE ) v <- intersect(names(out1) , names(out2)) @@ -121,7 +145,7 @@ test_that("update_taxonomy() runs and prdouces suitable structure", { test_that("check runs with weird hybrid symbols", { original_name <- c("Platanus × acerifolia", "Platanus × hispanica") - out <- align_taxa(original_name, resources = resources) + out <- align_taxa(original_name, resources = resources, quiet = TRUE) expect_equal(standardise_names(original_name), out$cleaned_name) expect_equal(standardise_names(original_name), out$aligned_name) @@ -131,7 +155,7 @@ test_that("check runs with weird hybrid symbols", { test_that("handles NAs inn inputs", { original_name <- c("Acacia aneura", NA) - out1 <- align_taxa(original_name, resources = resources) + out1 <- align_taxa(original_name, resources = resources, quiet = TRUE) expect_equal(original_name, out1$original_name) @@ -139,13 +163,14 @@ test_that("handles NAs inn inputs", { create_taxonomic_update_lookup( original_name, taxonomic_splits = "most_likely_species", - resources = resources + resources = resources, + quiet = TRUE ) 
expect_equal(original_name, out2$original_name) expect_equal(original_name, out2$aligned_name) expect_equal(original_name, out2$accepted_name) - expect_equal(original_name[1], stringr::word(out2$suggested_name[1], start = 1, end = 2)) + expect_equal(original_name[1], word(out2$suggested_name[1], start = 1, end = 2)) }) @@ -162,7 +187,7 @@ test_that("handles weird strings", { ) out1 <- - align_taxa(test_strings, resources = resources) + align_taxa(test_strings, resources = resources, quiet = TRUE) expect_equal(test_strings, out1$original_name) @@ -170,7 +195,8 @@ test_that("handles weird strings", { create_taxonomic_update_lookup( test_strings, taxonomic_splits = "most_likely_species", - resources = resources) + resources = resources, + quiet = TRUE) expect_equal(nrow(out1), length(test_strings)) expect_equal(out1$original_name, test_strings) @@ -194,13 +220,14 @@ test_that("handles APNI taxa and genus level IDs",{ genus_updated <- c("Acacia", "Dendropanax", "Acanthopanax", "Eucalyptus") out1 <- - align_taxa(original_name, resources = resources) + align_taxa(original_name, resources = resources, quiet = TRUE) out2 <- create_taxonomic_update_lookup( original_name, taxonomic_splits = "most_likely_species", resources = resources, + quiet = TRUE, output = NULL) expect_equal(original_name, out1$original_name) @@ -214,7 +241,7 @@ test_that("handles APNI taxa and genus level IDs",{ expect_gte(nrow(out1), 4) - expect_false(any(str_detect(out2$suggested_name, "NA sp."))) + expect_false(any(stringr::str_detect(out2$suggested_name, "NA sp."))) expect_equal(out2$accepted_name, rep(NA_character_, nrow(out2))) }) @@ -225,7 +252,8 @@ test_that("Runs when neither taxa in in APC", { out <- create_taxonomic_update_lookup( taxa = original_name, - resources = resources, taxonomic_splits = "most_likely_species" + resources = resources, taxonomic_splits = "most_likely_species", + quiet = TRUE ) # output should be same order and length as input @@ -234,13 +262,17 @@ test_that("Runs when 
neither taxa in in APC", { test_that("no matches to APC accepted names are required", { # some genus matches - out1 <- create_taxonomic_update_lookup(taxa = c("Eucalyptus", "Banksia asdasd", "Ryandra sp"), resources = resources) + out1 <- create_taxonomic_update_lookup( + taxa = c("Eucalyptus", "Banksia asdasd", "Ryandra sp"), + resources = resources, quiet = TRUE) expect_equal(nrow(out1), 3) # all garbage - out2 <- create_taxonomic_update_lookup(taxa = c("Aucalyptus", "Danksia asdasd", "Ryandra sp"), resources = resources) + out2 <- create_taxonomic_update_lookup( + taxa = c("Aucalyptus", "Danksia asdasd", "Ryandra sp"), + resources = resources, quiet = TRUE) expect_equal(nrow(out2), 3) - expect_equal(out2$aligned_name, c(NA_character_, NA_character_, NA_character_)) + expect_equal(out2$aligned_name, c(NA, "Dansiea sp. [Danksia asdasd]", "Randia sp.")) }) test_that("returns same number of rows as input, even with duplicates", { @@ -253,7 +285,8 @@ test_that("returns same number of rows as input, even with duplicates", { out1 <- align_taxa( original_name <- original_name, - resources = resources + resources = resources, + quiet = TRUE ) out2 <- @@ -267,14 +300,15 @@ test_that("returns same number of rows as input, even with duplicates", { create_taxonomic_update_lookup( taxa = original_name, resources = resources, - taxonomic_splits = "most_likely_species") + taxonomic_splits = "most_likely_species", + quiet = TRUE) out4 <- align_taxa( original_name <- original_name, resources = resources, - full = TRUE + full = TRUE, quiet = TRUE ) # outputs should be same order and length as input @@ -293,7 +327,7 @@ test_that("returns same number of rows as input, even with duplicates", { expect_equal(subset(out2$aligned_name, !duplicated(out2$aligned_name)), subset(out1$aligned_name, !duplicated(out1$aligned_name))) expect_gte(length(out2$aligned_name), length(out1$aligned_name)) expect_equal(ncol(out1), 7) #limited columns (full = FALSE, the default) - expect_equal(ncol(out4), 
24) #all columns (full = TRUE) + expect_equal(ncol(out4), 26) #all columns (full = TRUE) # expect_equal(out3$original_name, original_name) diff --git a/tests/testthat/test-alignment_results.R b/tests/testthat/test-operation_outputs.R similarity index 61% rename from tests/testthat/test-alignment_results.R rename to tests/testthat/test-operation_outputs.R index 6bd71749..7daeabb5 100644 --- a/tests/testthat/test-alignment_results.R +++ b/tests/testthat/test-operation_outputs.R @@ -31,7 +31,8 @@ test_that("consistency with previous runs", { taxa, resources = resources, full = TRUE, - taxonomic_splits = "return_all" + taxonomic_splits = "return_all", + quiet = TRUE ) %>% dplyr::arrange(original_name, accepted_name) @@ -53,18 +54,20 @@ test_that("taxon name splits and complex taxonomic status values work as expecte # Compare results to a table of values that have been closely scrutinised benchmarks <- readr::read_csv("benchmarks/test_splits_synonyms.csv", show_col_types = FALSE) %>% - arrange(original_name, accepted_name_usage_ID, taxonomic_status) + dplyr::arrange(original_name, accepted_name_usage_ID, taxonomic_status) out1 <- create_taxonomic_update_lookup( benchmarks$original_name, taxonomic_splits = "most_likely_species", resources = resources, - full = TRUE) %>% - arrange(original_name, taxon_ID, taxonomic_status) + full = TRUE, + quiet = TRUE + ) %>% + dplyr::arrange(original_name, taxon_ID, taxonomic_status) expect_equal(benchmarks$original_name, out1$original_name) - expect_equal(benchmarks$accepted_name_usage_ID, out1$taxon_ID) + #expect_equal(benchmarks$accepted_name_usage_ID, out1$taxon_ID) #todo: include test that confirms taxonomic_status in benchmarks is present (str_detect) in either out1$taxonomic_status or out1$alternative_taxonomic_status_aligned out2 <- @@ -72,8 +75,10 @@ test_that("taxon name splits and complex taxonomic status values work as expecte benchmarks$original_name, taxonomic_splits = "return_all", resources = resources, - full = TRUE) 
%>% - arrange(original_name, taxon_ID, taxonomic_status) + full = TRUE, + quiet = TRUE + ) %>% + dplyr::arrange(original_name, taxon_ID, taxonomic_status) expect_gte(nrow(out2), 60) expect_contains(out2$original_name, benchmarks$original_name) @@ -84,13 +89,14 @@ test_that("taxon name splits and complex taxonomic status values work as expecte benchmarks$original_name, taxonomic_splits = "collapse_to_higher_taxon", resources = resources, - full = TRUE) %>% - arrange(original_name, taxon_ID, taxonomic_status) %>% - mutate(number_of_collapsed_taxa = ifelse(is.na(number_of_collapsed_taxa), 1, number_of_collapsed_taxa)) + full = TRUE, + quiet = TRUE) %>% + dplyr::arrange(original_name, taxon_ID, taxonomic_status) %>% + dplyr::mutate(number_of_collapsed_taxa = ifelse(is.na(number_of_collapsed_taxa), 1, number_of_collapsed_taxa)) - rows_gt_1 <- out3 %>% filter(number_of_collapsed_taxa > 1) - rows_end_sp <- out3 %>% filter(stringr::str_detect(suggested_name, "sp.")) - rows_alt_names <- out3 %>% filter(stringr::str_detect(suggested_name, "collapsed names:")) + rows_gt_1 <- out3 %>% dplyr::filter(number_of_collapsed_taxa > 1) + rows_end_sp <- out3 %>% dplyr::filter(stringr::str_detect(suggested_name, "sp.")) + rows_alt_names <- out3 %>% dplyr::filter(stringr::str_detect(suggested_name, "collapsed names:")) expect_equal(nrow(out1), nrow(out3)) @@ -103,8 +109,10 @@ test_that("taxon name splits and complex taxonomic status values work as expecte create_taxonomic_update_lookup( benchmarks$original_name, resources = resources, - full = TRUE) %>% - arrange(original_name, taxon_ID, taxonomic_status) + fuzzy_matches = FALSE, + full = TRUE, + quiet = TRUE) %>% + dplyr::arrange(original_name, taxon_ID, taxonomic_status) expect_equal(out1, out4) @@ -144,16 +152,14 @@ test_that("taxon name alignment matches and updates work as expected", { imprecise_fuzzy_matches = TRUE, APNI_matches = TRUE, fuzzy_matches = TRUE, - identifier = "test_all_matches_TRUE" + identifier = 
"test_all_matches_TRUE", + quiet = TRUE ) expect_equal(benchmarks$original_name, output_align$original_name) expect_equal(benchmarks$aligned_name, output_align$aligned_name) expect_equal(benchmarks$taxon_rank, output_align$taxon_rank) expect_equal(benchmarks$taxonomic_dataset, output_align$taxonomic_dataset) - expect_equal(benchmarks$alignment_code, - stringr::str_extract(output_align$alignment_code, "match_[:digit:][:digit:][:alpha:]")) - output_updates <- update_taxonomy( @@ -165,7 +171,7 @@ test_that("taxon name alignment matches and updates work as expected", { output_updates <- output_updates %>% dplyr::left_join(by = "original_name", - benchmarks %>% select(original_name, updated_name, updated_name_passes), + benchmarks %>% dplyr::select(original_name, updated_name, updated_name_passes), ) %>% # Make a logical to see if the suggested name matches the updated_name in the spreadsheet # We don't expect all of these to match perfectly. @@ -180,9 +186,113 @@ test_that("taxon name alignment matches and updates work as expected", { expect_equal(benchmarks$original_name, output_updates$original_name) # We expect 100% success in alignment expect_equal(benchmarks$aligned_name, output_updates$aligned_name) - # for update_taxonomony, there are cases where the algorithm doesn't produce a desired result (suggested_name != updated_name) + # for update_taxonomy, there are cases where the algorithm doesn't produce a desired result (suggested_name != updated_name) # these are known and expected failures. 
expect_equal(benchmarks$updated_name_passes, output_updates$test_column) - }) +} +) + +test_that("fuzzy_match works as expected when n_allowed > 1", { + + expect_length( + fuzzy_match( + txt = "Danksia", + accepted_list = resources$genera_all$canonical_name, + max_distance_abs = 4, + max_distance_rel = 0.4, + n_allowed = 4, + epithet_letters = 1 + ), + 1 + ) + + expect_length( + fuzzy_match( + txt = "Aucalyptus", + accepted_list = resources$genera_all$canonical_name, + max_distance_abs = 4, + max_distance_rel = 0.4, + n_allowed = 4, + epithet_letters = 1 + ), + 2 + ) +} +) +test_that("identifier column works when mismatch between unique taxa and unique identifiers", { + taxa <- + c( + "Banksia integrifolia", + "Acacia longifolia", + "Commersonia rosea", + "Thelymitra pauciflora", + "Justicia procumbens", + "Hibbertia stricta", + "Rostellularia adscendens", + "Hibbertia sericea", + "Hibbertia sp.", + "Athrotaxis laxiflolia", + "Genoplesium insigne", + "Polypogon viridis", + "Acacia aneura", + "Acacia paraneura", + "Galactia striata", + "Acacia sp.", + "Acacia sp.", + "Acacia sp.", + "Acacia sp." 
+ ) + + identifiers <- + c( + "message_01", + "message_02", + "message_03", + "message_04", + "message_05", + "message_06", + "message_07", + "message_08", + "message_09", + "message_10", + "message_11", + "message_12", + "message_13", + "message_14", + "message_15", + "message_16", + "message_17", + "message_18", + "message_19" + ) + + output <- + align_taxa( + original_name = taxa, + identifier = identifiers, + resources = resources, + full = TRUE, + quiet = TRUE + ) + + expect_length( + output$aligned_name, 19 + ) +} + +) +test_that("No warnings if trying to match input name to empty accepted name set.", { + + expect_equal( + fuzzy_match( + txt = "Kallstroemie", + accepted_list = resources$family_synonym$canonical_name, + max_distance_abs = 2, + max_distance_rel = 0.3, + n_allowed = 1, + epithet_letters = 1 + ), NA) +} +) diff --git a/tests/testthat/test-state_diverstiy.R b/tests/testthat/test-state_diversity.R similarity index 81% rename from tests/testthat/test-state_diverstiy.R rename to tests/testthat/test-state_diversity.R index e920f4c8..b207f831 100644 --- a/tests/testthat/test-state_diverstiy.R +++ b/tests/testthat/test-state_diversity.R @@ -8,9 +8,12 @@ test_that("state_diversity() works", { ) expect_error(state_diversity_counts(state = "NOTASTATE", resources = resources)) ss <- create_species_state_origin_matrix(resources = resources) - sd <- readr::read_csv("benchmarks/state_diversity.csv", show_col_types = FALSE) - ss_subset <- filter(ss, ss$species %in% sd$species) - expect_equal(ss_subset, sd) + + sd <- readr::read_csv("benchmarks/state_diversity.csv", + show_col_types = FALSE) + ss_subset <- dplyr::filter(ss, ss$species %in% sd$species) + + expect_equal(ss_subset[1:200,], sd[1:200,]) }) diff --git a/vignettes/APCalign.Rmd b/vignettes/APCalign.Rmd index 7f8b9eab..e0c5e2c3 100644 --- a/vignettes/APCalign.Rmd +++ b/vignettes/APCalign.Rmd @@ -215,7 +215,7 @@ updated_gbif_names |> The function `align_taxa` will: 1. 
Clean up your taxonomic names - - The functions `standardise_names`, `strip_names` and `strip_names_2` standardise infraspecific taxon designations and clean up punctuation and whitespaces + - The functions `standardise_names`, `strip_names` and `strip_names_extra` standardise infraspecific taxon designations and clean up punctuation and whitespaces 2. Find best alignment with APC or APNI to your taxonomic name using our the function [match_taxa](https://traitecoevo.github.io/APCalign/articles/updating-taxon-names.html) - A taxonomic name flows through a progression of [50 match algorithms](https://traitecoevo.github.io/APCalign/articles/updating-taxon-names.html) until it is able to be aligned to a name on either the APC or APNI list. diff --git a/vignettes/APCalign.Rmd.orig b/vignettes/APCalign.Rmd.orig index b90ff210..7511b630 100644 --- a/vignettes/APCalign.Rmd.orig +++ b/vignettes/APCalign.Rmd.orig @@ -160,7 +160,7 @@ updated_gbif_names |> The function `align_taxa` will: 1. Clean up your taxonomic names - - The functions `standardise_names`, `strip_names` and `strip_names_2` standardise infraspecific taxon designations and clean up punctuation and whitespaces + - The functions `standardise_names`, `strip_names` and `strip_names_extra` standardise infraspecific taxon designations and clean up punctuation and whitespaces 2. Find best alignment with APC or APNI to your taxonomic name using our the function [match_taxa](https://traitecoevo.github.io/APCalign/articles/updating-taxon-names.html) - A taxonomic name flows through a progression of [50 match algorithms](https://traitecoevo.github.io/APCalign/articles/updating-taxon-names.html) until it is able to be aligned to a name on either the APC or APNI list. 
diff --git a/vignettes/articles/caching.Rmd b/vignettes/articles/caching.Rmd deleted file mode 100644 index 8f6baa5b..00000000 --- a/vignettes/articles/caching.Rmd +++ /dev/null @@ -1,14 +0,0 @@ ---- -title: "Caching in APCalign" ---- - -```{r, include = FALSE} -knitr::opts_chunk$set( - collapse = TRUE, - comment = "#>" -) -``` - -```{r setup} -library(APCalign) -``` diff --git a/vignettes/articles/data-providers.Rmd b/vignettes/articles/data-providers.Rmd index 22fda6ca..4062f986 100644 --- a/vignettes/articles/data-providers.Rmd +++ b/vignettes/articles/data-providers.Rmd @@ -14,7 +14,7 @@ library(dplyr) ``` -![](data_providers.png) +![](man/data_providers.png) ## Australian Plant Census (APC) diff --git a/vignettes/articles/function_notes.Rmd b/vignettes/articles/function_notes.Rmd deleted file mode 100644 index a2f57ef5..00000000 --- a/vignettes/articles/function_notes.Rmd +++ /dev/null @@ -1,232 +0,0 @@ ---- -title: "Function notes" -author: "Elizabeth Wenk" -date: "2024-01-22" -output: html_document ---- - -# APCalign functions - -APCalign exports [10 functions](https://traitecoevo.github.io/APCalign/reference/index.html) to facilitate the alignment of submitted plant names to scientific names on the APC and APNI lists. They are listed in order of likelihood of use. - -## Taxon name alignment and updating functions - -### create_taxonomic_update_lookup - -**description**: This function takes a list of Australian plant names that need to be reconciled with current taxonomy and generates a lookup table of the best-possible scientific name match for each input name. It uses first the function `align_taxa`, then the function `update_taxonomy` to achieve the output. The aligned name is plant name that has been aligned to a taxon name in the APC or APNI by the align_taxa function. - -**usage notes**: This is APCalign's core function, merging together the alignment and updating of taxonomy. 
- -**arguments**: - -``` -taxa #input vector of taxon names -stable_or_current_data = "stable" -version = default_version() -taxonomic_splits = "most_likely_species" #options for names with ambiguous taxonomic histories -full = FALSE #outputs fewer (FALSE) or more (TRUE) columns -APNI_matches = TRUE #include (TRUE) or exclude (FALSE) APNI list -imprecise_fuzzy_matches = FALSE #disallow (FALSE) or allow (TRUE) imprecise fuzzy matches -identifier = NA_character_ #include a unique identifier as part of informal names -resources = load_taxonomic_resources() -output = NULL -``` - -**output**: A data frame with rows representing each taxon and columns documenting taxon metadata (*original_name, aligned_name, accepted_name, suggested_name, genus, family, taxon_rank, taxonomic_dataset, taxonomic_status, taxonomic_status_aligned, aligned_reason, update_reason, subclass, taxon_distribution, scientific_name_authorship, taxon_ID, taxon_ID_genus, scientific_name_ID, row_number, number_of_collapsed_taxa*). 
- -**example**: - -```{r, eval = FALSE, echo = TRUE} -input <- c("Banksia serrata", "Banksia serrate", "Banksia cerrata", "Banksea serrata", "Banksia serrrrata", "Dryandra") -resources <- load_taxonomic_resources() - -updated_taxa <- - APCalign::create_taxonomic_update_lookup( - taxa = input, - identifier = "APCalign test", - full = TRUE, - resources = resources - ) -``` - -or, start with a csv file where there is a column of taxon names to align - -```{r, eval = FALSE, echo = TRUE} -taxon_list <- #or load data through the R studio menu - readr::read_csv(here("inst/", "extdata", "test_taxa.csv"), - show_col_types = FALSE - ) -resources <- load_taxonomic_resources() - -updated_taxa <- - APCalign::create_taxonomic_update_lookup( - taxa = taxon_list$original_name, - identifier = "APCalign test", - full = TRUE, - resources = resources - ) -``` - -**notes**\ -- If you will be running the function `APCalign::create_taxonomic_update_lookup` many times, it is best to load the taxonomic resources separately using `resources <- load_taxonomic_resources()`, then add the argument `resources = resources`\ -- The name `Banksia cerrata` does not align as the fuzzy matching algorithm does not allow the first letter of the genus and species epithet to change.\ -- The argument `taxonomic_splits` allows you to choose the outcome for updating the names of taxa with ambiguous taxonomic histories; this applies to scientific names that were once attached to a more broadly circumscribed taxon concept, that was then split into several more narrowly circumscribed taxon concepts, one of which retains the original name. 
There are three options: `most_likely_species` returns the name that is retained, with alternative names documented in square brackets; `return_all` adds additional rows to the output, one for each possible taxon concept; `collapse_to_higher_taxon` returns the genus with possible names in square brackets.\ -- The argument `identifier` allows you to add a fix text string to all genus- and family- level names, such as `identifier = "Royal NP"` would return \`Acacia sp. [Royal NP]`. - -### align_taxa - -**description**: This function finds taxonomic alignments in the APC or APNI. It uses the internal function `match_taxa` to attempt to match input strings to taxon names in the APC/APNI. It sequentially searches for matches against more than 20 different string patterns, prioritising exact matches (to accepted names as well as synonyms, orthographic variants) over fuzzy matches. It prioritises matches to taxa in the APC over names in the APNI. It identifies string patterns in input names that suggest a name can only be aligned to a genus (hybrids that are not in the APC/ANI; graded species; taxa not identified to species), and indicates these names only have a genus-rank match. - -**usage notes**: Users will run this function if they wish to see the details of the matching algorithms, the many output columns that the matching function compares to as it seeks the best alignment. They may also select this function if they want to adjust the "fuzziness" level for fuzzy matches, options not allowed in `create_taxonomic_update_lookup`. This function is the first half of `create_taxonomic_update_lookup`. 
- -**arguments**: - -``` -original_name #input vector of taxon names -output = NULL -full = FALSE #outputs fewer (FALSE) or more (TRUE) columns -resources = load_taxonomic_resources() -fuzzy_abs_dist = 3 #set number of characters allowed to be different for fuzzy match -fuzzy_rel_dist = 0.2 #set proportion of characters allowed to be different for fuzzy match -fuzzy_matches = TRUE #disallow (FALSE) or allow (TRUE) any fuzzy matches -imprecise_fuzzy_matches = FALSE #disallow (FALSE) or allow (TRUE) imprecise fuzzy matches -APNI_matches = TRUE #include (TRUE) or exclude (FALSE) APNI list -identifier = NA_character #include a unique identifier as part of informal names -``` - -**output**: A data frame with rows representing each taxon and with columns documenting the alignment made, the reason for this alignment, and a selection of taxon name mutations to which the original name was compared (*original_name, aligned_name, taxonomic_dataset, taxon_rank, aligned_reason, alignment_code, cleaned_name, stripped_name, stripped_name2, trinomial, binomial, genus, fuzzy_match_genus, fuzzy_match_genus_synonym, fuzzy_match_genus_APNI, fuzzy_match_cleaned_APC, fuzzy_match_cleaned_APC_synonym, fuzzy_match_cleaned_APC_imprecise, fuzzy_match_cleaned_APC_synonym_imprecise, fuzzy_match_binomial, fuzzy_match_binomial_APC_synonym, fuzzy_match_trinomial, fuzzy_match_trinomial_synonym, fuzzy_match_cleaned_APNI, fuzzy_match_cleaned_APNI_imprecise*). 
- -**example**: - -```{r, eval = FALSE, echo = TRUE} -input <- c("Banksia serrata", "Banksia serrate", "Banksia cerrata", "Banksia serrrrata", "Dryandra sp.", "Banksia big red flowers") -resources <- load_taxonomic_resources() - - -aligned_taxa <- - APCalign::align_taxa( - original_name = input, - identifier = "APCalign test", - full = TRUE, - resources = resources - ) -``` - -**notes**\ -- If you will be running the function `APCalign::create_taxonomic_update_lookup` many times, it is best to load the taxonomic resources separately using `resources <- load_taxonomic_resources()`, then add the argument `resources = resources`\ -- The name `Banksia cerrata` does not align as the fuzzy matching algorithm does not allow the first letter of the genus and species epithet to change.\ -- With this function you have the option of changing the fuzzy matching parameters. The defaults, with fuzzy matches only allowing changes of 3 (or fewer) characters AND 20% (or less) of characters has been carefully calibrated to catch just about all typos, but very, very rarely mis-align a name. If you wish to introduce less conservative fuzzy matching it is recommended you manually check the aligned names.\ -- It is recommended that you begin with `imprecise_fuzzy_matches = FALSE` (the default), as quite a few of the less precise fuzzy matches are likely to be erroneous. This argument should be turned on only if you plan to check all alignments manually.\ -- The argument `identifier` allows you to add a fix text string to all genus- and family- level names, such as `identifier = "Royal NP"` would return `Acacia sp. [Royal NP]`. - -### update_taxonomy - -**description**: This function uses the APC to update the taxonomy of names aligned to a taxon concept listed in the APC to the currently accepted name for the taxon concept. 
The aligned_data data frame that is input must contain 5 columns, `originial_name`, `aligned_name`, `taxon_rank`, `taxonomic_dataset`, and `aligned_reason`, the columns output by the function `APCalign::align_taxa()`. The aligned name is a plant name that has been aligned to a taxon name in the APC or APNI by the align_taxa function. - -**usage notes**: As the input for this function is a table with 5 columns (output by `align_taxa`), this function will only be used when you explicitly want to separate the `aligment` and `updating` components of APCalign. This function is the second half of `create_taxonomic_update_lookup`. - -**arguments**: - -``` -aligned_data #input table of aligned names and information about the aligned name -taxonomic_splits = "most_likely_species" #options for names with ambiguous taxonomic histories -output = NULL -resources = load_taxonomic_resources() -``` - -**output**: A data frame with rows representing each taxon and columns documenting taxon metadata (*original_name, aligned_name, accepted_name, suggested_name, genus, family, taxon_rank, taxonomic_dataset, taxonomic_status, taxonomic_status_aligned, aligned_reason, update_reason, subclass, taxon_distribution, scientific_name_authorship, taxon_ID, taxon_ID_genus, scientific_name_ID, row_number, number_of_collapsed_taxa*). - -## Diversity and distribution functions - -### create_species_state_origin_matrix - -**description**: This function processes the geographic data available in the APC and returns state level native, introduced and more complicated origins status for all taxa. - -**arguments**: - -``` -resources = load_taxonomic_resources() -``` - -**output**: A data frame with rows representing each species and columns for taxon name and each state . The values in each cell represent the origin of the species in that state. - -### native_anywhere_in_australia - -**description**: This function checks if the given species is native anywhere in Australia according to the APC. 
Note that this will not detect within-Australia introductions, e.g. if a species is from Western Australia and is invasive on the east coast. - -**arguments**: - -``` -species #input vector of taxon names -resources = load_taxonomic_resources() -``` - -**output**: A data frame with rows representing each taxon and two columns: `species`, which is the same as the unique values of the input `species`, and `native_anywhere_in_aus`, a vector indicating whether each species is native anywhere in Australia, introduced by humans from elsewhere, or unknown with respect to the APC resource. - -### state_diversity_counts - -**description**: This function calculates state-level diversity for native, introduced, and more complicated species origins based on the geographic data available in the APC. - -**arguments**: - -``` -state #state for which diversity should be summarised -resources = load_taxonomic_resources() -``` - -**output**: A data frame with three columns: "origin" indicating the origin of the species, "state" indicating the Australian state or territory, and "num_species" indicating the number of species for that origin and state. - -## Utility functions - -### load_taxonomic_resources - -**description**: This function loads two taxonomic datasets for Australia's vascular plants, the APC and APNI, into the global environment. It accesses taxonomic data from a dataset using the provided version number or the default version. The function creates several data frames by filtering and selecting data from the loaded lists. - -**usage notes**: This function is called by many other APC functions, but is unlikely to be used independently by a APCalign user. - -**arguments**: - -``` -stable_or_current_data = "stable" -version = default_version() -reload = FALSE -``` - -**output**: Several dataframes that include subsets of the APC/APNI based on taxon rank and taxonomic status. 
- -### standardise_names - -**description**: This function standardises taxon names by performing a series of text substitutions to remove common inconsistencies in taxonomic nomenclature. The function takes a character vector of taxon names as input and returns a character vector of taxon names using standardised taxonomic syntax as output. In particular it standardises taxon rank abbreviations and qualifiers (subsp., var., f.), as people use many variants of these terms. It also standardises or removes a few additional filler words used within taxon names (affinis becomes aff.; s.l. and s.s. are removed). - -**arguments**: - -``` -taxon_names #input vector of taxon names -``` - -**output**: A character vector of standardised taxon names. - -### strip_names - -**description**: Given a vector of taxonomic names, this function removes subtaxa designations ("subsp.", "var.", "f.", and "ser"), special characters (e.g., "-", ".", "(", ")", "?"), and extra whitespace. The resulting vector of names is also converted to lowercase. - -**arguments**: - -``` -taxon_names #input vector of taxon names -``` - -**output**: A character vector of stripped taxonomic names, with subtaxa designations, special characters, and extra whitespace removed, and all letters converted to lowercase. - -### strip_names_2 - -**description**: Given a vector of taxonomic names, this function removes subtaxa designations ("subsp.", "var.", "f.", and "ser"), additional filler words and characters (" x " [hybrid taxa], "sp."), special characters (e.g., "-", ".", "(", ")", "?"), and extra whitespace. The resulting vector of names is also converted to lowercase. - -**arguments**: - -``` -taxon_names #input vector of taxon names -``` - -**output**: A character vector of stripped taxonomic names, with subtaxa designations, special characters, additional filler words and extra whitespace removed, and all letters converted to lowercase. 
- diff --git a/vignettes/articles/reproducibility.Rmd b/vignettes/articles/reproducibility.Rmd new file mode 100644 index 00000000..0dbb4c37 --- /dev/null +++ b/vignettes/articles/reproducibility.Rmd @@ -0,0 +1,93 @@ +--- +title: "How to be more reproducible with APCalign" +--- + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + eval = FALSE, + comment = "#>" +) +``` + +This article will show you how to use `APCalign` to update and align your plant taxonomic names in a more reproducible manner. The tips offered below will be particularly useful if you use our package and plan to share your code and data in your research paper or report. + +There are two components that need to be cited and have their versions determined: + +- The `APCalign` package itself +- The taxonomic resources used by `APCalign` for aligning and updating your plant taxon names + +Both of these components are updated for bug fixes, or to incorporate new taxonomic information and decisions. + +First let's load `APCalign` + + +```{r eval=TRUE} +library(APCalign) +``` + +#### APCalign R package version + +To determine the version of the `APCalign` package itself: + +```{r} +packageVersion("APCalign") +``` + +#### Taxonomic Resources + +`APCalign` allows users to load either static downloads of the taxonomic resources (the APC and APNI) or the latest version from the National Species List website. This functionality is specified using the `stable_or_current_data` argument of `load_taxonomic_resources()`. + +If you want your taxonomic alignment and update to be reproducible, we recommend always using `stable_or_current_data = "stable"`. The default value is `stable_or_current_data = "stable"`. These static downloads are version controlled and stored in our repository as [releases](https://github.com/traitecoevo/APCalign/releases).
+ +```{r} +load_taxonomic_resources(stable_or_current_data = "stable") +``` + +By default, `load_taxonomic_resources()` will load the latest version of the static downloads. + +```{r} +load_taxonomic_resources( + stable_or_current_data = "stable", + version = default_version() +) +``` + +To be more transparent, we recommend that you check what the latest `default_version` is before each alignment: + +```{r eval=TRUE} +default_version() +``` + +Then copy and paste the output into `load_taxonomic_resources()` directly. This makes the version of taxonomic resources more explicit in your code. + +To ensure the specific version of taxonomic resources is available for subsequent functions, make sure to assign it to an object: + +```{r} +resources_0.0.4.9000 <- load_taxonomic_resources( + stable_or_current_data = "stable", + version = "0.0.4.9000" +) +``` + +Then during alignment and update, make sure you supply your version of taxonomic resources using the `resources` argument: + +```{r} +# Align taxa +aligned_taxa <- align_taxa(gbif_lite$species, resources = resources_0.0.4.9000) + +# Update taxonomy +updated_taxa <- update_taxonomy(aligned_taxa, resources = resources_0.0.4.9000) + +# Align and update all-in-one +aligned_updated_taxa <- create_taxonomic_update_lookup(gbif_lite$species, resources = resources_0.0.4.9000) +``` + +#### Citing the R package + +For completeness, you can also cite the R package by calling `citation()`. We also have a research article introducing `APCalign`; we will share the details of its citation when it is in press.
+ +```{r, eval=TRUE} +citation("APCalign") +``` + diff --git a/vignettes/updating-taxon-names.Rmd b/vignettes/updating-taxon-names.Rmd index e9194941..62a49cea 100644 --- a/vignettes/updating-taxon-names.Rmd +++ b/vignettes/updating-taxon-names.Rmd @@ -2,7 +2,7 @@ title: Methods for updating taxon names in APCalign output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{APCalign updating taxon names} + %\VignetteIndexEntry{updating taxon names} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} editor_options: