Skip to content

Commit

Permalink
Apply our faster implementation of word throughout (#220)
Browse files Browse the repository at this point in the history
As part of #196, we found that stringr::word was quite slow, and so implemented a faster version. This PR makes the new word function a private function accessible via APCalign:::word; 
adds tests for new function; 
extends use of this new function throughout

Co-authored-by: ehwenk <[email protected]>
  • Loading branch information
dfalster and ehwenk authored May 1, 2024
1 parent 9733c70 commit 9be9776
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 50 deletions.
2 changes: 1 addition & 1 deletion R/create_species_state_origin_matrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ separate_states <- function(data) {
#' @noRd
identify_places <- function(sep_state_data) {
all_codes <- unique(stringr::str_trim(unlist(sep_state_data)))
unique(stringr::word(all_codes[!is.na(all_codes)], 1, 1))
unique(word(all_codes[!is.na(all_codes)], 1, 1))
}

#' @noRd
Expand Down
32 changes: 0 additions & 32 deletions R/load_taxonomic_resources.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,38 +55,6 @@ load_taxonomic_resources <-
### Note: Use `zzzz zzzz` because the fuzzy matching algorithm can't handles NA's
zzz <- "zzzz zzzz"

word <- function(string, start = 1L, end = start) {
if(end == start) {
str_split_i(string, " ", start)
} else if(end == start+1) {
w1 <- str_split_i(string, " ", start)
w2 <- str_split_i(string, " ", start+1)

out <- paste(w1, w2)
out[is.na(w2)] <- NA_character_

out
} else if(end == start+2) {

w1 <- str_split_i(string, " ", start)
w2 <- str_split_i(string, " ", start+1)
w3 <- str_split_i(string, " ", start+2)

out <- paste(w1, w2, w3)
out[is.na(w2) | is.na(w3)] <- NA_character_

out
} else {
i <- seq(start, end)

txt <- str_split(string, " ")
lngth <- purrr::map_int(txt, length)
out <- purrr::map(txt, ~paste(.x[i], collapse = " "))
out[lngth < end] <- NA
out
}
}

taxonomic_resources$APC <- taxonomic_resources$APC %>%
rename(
taxon_ID = .data$taxonID,
Expand Down
24 changes: 12 additions & 12 deletions R/match_taxa.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@ match_taxa <- function(
update_na_with(strip_names(cleaned_name)),
stripped_name2 = stripped_name2 %>%
update_na_with(strip_names_extra(stripped_name)),
trinomial = stringr::word(stripped_name2, start = 1, end = 3),
binomial = stringr::word(stripped_name2, start = 1, end = 2),
trinomial = word(stripped_name2, start = 1, end = 3),
binomial = word(stripped_name2, start = 1, end = 2),
genus = extract_genus(original_name)
)

Expand Down Expand Up @@ -224,7 +224,7 @@ match_taxa <- function(
i <-
stringr::str_detect(taxa$tocheck$cleaned_name, "[:space:]sp\\.$") &
taxa$tocheck$genus %in% resources$genera_all2$genus &
stringr::word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")
word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")

ii <-
match(
Expand Down Expand Up @@ -277,7 +277,7 @@ match_taxa <- function(
i <-
stringr::str_detect(taxa$tocheck$cleaned_name, "[:space:]sp\\.$") &
taxa$tocheck$fuzzy_match_genus %in% resources$genera_accepted$genus &
stringr::word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")
word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")

ii <-
match(
Expand Down Expand Up @@ -316,7 +316,7 @@ match_taxa <- function(
i <-
stringr::str_detect(taxa$tocheck$cleaned_name, "[:space:]sp\\.$") &
taxa$tocheck$fuzzy_match_genus_synonym %in% resources$genera_synonym$genus &
stringr::word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")
word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")

ii <-
match(
Expand Down Expand Up @@ -353,7 +353,7 @@ match_taxa <- function(
i <-
stringr::str_detect(taxa$tocheck$cleaned_name, "[:space:]sp\\.$") &
taxa$tocheck$genus %in% resources$family_accepted$canonical_name &
stringr::word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")
word(taxa$tocheck$cleaned_name, 2) %in% c("sp.")

taxa$tocheck[i,] <- taxa$tocheck[i,] %>%
mutate(
Expand Down Expand Up @@ -524,7 +524,7 @@ match_taxa <- function(
mutate(
taxonomic_dataset = NA_character_,
taxon_rank = NA,
aligned_name_tmp = paste0(stringr::word(cleaned_name,1), " sp. [", cleaned_name),
aligned_name_tmp = paste0(word(cleaned_name,1), " sp. [", cleaned_name),
aligned_name = NA,
aligned_reason = paste0(
"Taxon name includes '--' (double dash) indicating an intergrade between two taxa, but exact and fuzzy matches fail to align to a genus in the APC or APNI (",
Expand Down Expand Up @@ -726,7 +726,7 @@ match_taxa <- function(
mutate(
taxonomic_dataset = NA_character_,
taxon_rank = NA,
aligned_name_tmp = paste0(stringr::word(cleaned_name,1), " sp. [", cleaned_name),
aligned_name_tmp = paste0(word(cleaned_name,1), " sp. [", cleaned_name),
aligned_name = NA,
aligned_reason = paste0(
"Taxon name includes '/' (slash) indicating an uncertain species identification but exact and fuzzy matches fail to align to a genus in the APC or APNI (",
Expand Down Expand Up @@ -1033,7 +1033,7 @@ match_taxa <- function(
mutate(
taxonomic_dataset = NA_character_,
taxon_rank = NA,
aligned_name_tmp = paste0(stringr::word(cleaned_name,1), " sp. [", cleaned_name),
aligned_name_tmp = paste0(word(cleaned_name,1), " sp. [", cleaned_name),
aligned_name = NA,
aligned_reason = paste0(
"Taxon name includes 'affinis' or 'aff' indicating an unknown taxon that bears an affinity to a different taxon in the same genus, but exact and fuzzy matches fail to align to a genus in the APC or APNI (",
Expand Down Expand Up @@ -1297,7 +1297,7 @@ match_taxa <- function(
mutate(
taxonomic_dataset = NA_character_,
taxon_rank = NA,
aligned_name_tmp = paste0(stringr::word(cleaned_name,1), " x [", cleaned_name),
aligned_name_tmp = paste0(word(cleaned_name,1), " x [", cleaned_name),
aligned_name = NA,
aligned_reason = paste0(
"Taxon name includes ' x ' indicating a hybrid, but exact and fuzzy matches fail to align to a genus in the APC or APNI (",
Expand Down Expand Up @@ -1927,7 +1927,7 @@ match_taxa <- function(
# The 'taxon name' is then reformatted as `family sp.` with the original name in square brackets.

i <-
stringr::str_detect(stringr::word(taxa$tocheck$cleaned_name, 1), "aceae$") &
stringr::str_detect(word(taxa$tocheck$cleaned_name, 1), "aceae$") &
taxa$tocheck$genus %in% resources$family_accepted$canonical_name

taxa$tocheck[i,] <- taxa$tocheck[i,] %>%
Expand Down Expand Up @@ -1958,7 +1958,7 @@ match_taxa <- function(
# The 'taxon name' is then reformatted as `family sp.` with the original name in square brackets.

i <-
stringr::str_detect(stringr::word(taxa$tocheck$cleaned_name, 1), "ae$") &
stringr::str_detect(word(taxa$tocheck$cleaned_name, 1), "ae$") &
taxa$tocheck$genus %in% resources$family_synonym$canonical_name

taxa$tocheck[i,] <- taxa$tocheck[i,] %>%
Expand Down
8 changes: 4 additions & 4 deletions R/update_taxonomy.R
Original file line number Diff line number Diff line change
Expand Up @@ -456,7 +456,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources,
dplyr::group_by(canonical_name) %>%
dplyr::mutate(
number_of_collapsed_taxa = sum(number_of_collapsed_taxa),
accepted_name_2 = paste(stringr::word(accepted_name_2, 1), "sp."),
accepted_name_2 = paste(word(accepted_name_2, 1), "sp."),
alternative_possible_names =
alternative_accepted_name_tmp %>%
unique() %>%
Expand All @@ -469,7 +469,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources,
dplyr::mutate(
alternative_possible_names = ifelse(taxonomic_status_aligned != "accepted" & canonical_name %in% resources$'APC list (accepted)'$canonical_name, NA, alternative_possible_names),
alternative_possible_names = stringr::str_replace_all(alternative_possible_names, "\\ \\|\\ NA", ""),
suggested_collapsed_name = paste(stringr::word(accepted_name_2, 1), "sp. [collapsed names:", alternative_possible_names, "]"),
suggested_collapsed_name = paste(word(accepted_name_2, 1), "sp. [collapsed names:", alternative_possible_names, "]"),
taxon_rank = ifelse(number_of_collapsed_taxa > 1 & species_and_infraspecific(taxon_rank), "genus", taxon_rank)
) %>%
dplyr::select(-alternative_accepted_name_tmp, -alternative_possible_names)
Expand Down Expand Up @@ -561,7 +561,7 @@ update_taxonomy_APC_species_and_infraspecific_taxa <- function(data, resources,
## there are rare cases of names within the APC that do not align to an accepted name.
## For these taxa, the `suggested_name` is the `aligned_name` and the family name must be added
genus = ifelse(is.na(genus_accepted), genus, genus_accepted),
family = ifelse(is.na(family), resources$APC$family[match(stringr::word(suggested_name, 1), resources$APC$genus)], family),
family = ifelse(is.na(family), resources$APC$family[match(word(suggested_name, 1), resources$APC$genus)], family),
update_reason = ifelse(
(number_of_collapsed_taxa > 1) & !is.na(number_of_collapsed_taxa),
"collapsed to genus due to ambiguity",
Expand Down Expand Up @@ -609,7 +609,7 @@ update_taxonomy_APNI_species_and_infraspecific_taxa <- function(data, resources)
aligned_name,
suggested_name
),
genus = stringr::word(suggested_name, 1)
genus = word(suggested_name, 1)
) %>%
# when possible the genus of APNI names is matched to an APC-accepted genus and the appropriate genus-level taxon_ID is added
dplyr::left_join(
Expand Down
47 changes: 47 additions & 0 deletions R/word.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#' Extract words from a sentence. Intended as a faster
#' replacement for stringr::word
#'
#' @param string A character vector

#' @param start,end Pair of integer vectors giving range of words (inclusive)
#' to extract. The default value select the first word.
#' @param sep Separator between words. Defaults to single space.
#' @return A character vector with the same length as `string`/`start`/`end`.
#'
#' @examples
#' spp <- c("Banksia serrata", "Actinotus helanthii")
#' APCalign:::word(spp, 1)
#' APCalign:::word(spp, 2)
word <- function(string, start = 1L, end = start, sep = " ") {
if(end == start) {
str_split_i(string, " ", start)
} else if(end == start+1) {
w1 <- str_split_i(string, sep, start)
w2 <- str_split_i(string, sep, start+1)

out <- paste(w1, w2)
out[is.na(w2)] <- NA_character_

return(out)
} else if(end == start+2) {

w1 <- str_split_i(string, sep, start)
w2 <- str_split_i(string, sep, start+1)
w3 <- str_split_i(string, sep, start+2)

out <- paste(w1, w2, w3)
out[is.na(w2) | is.na(w3)] <- NA_character_

return(out)
} else {
i <- seq(start, end)

txt <- str_split(string, sep)
out <- purrr::map(txt, ~paste(.x[i], collapse = sep))

lngth <- purrr::map_int(txt, length)
out[lngth < end] <- NA

return(out)
}
}
29 changes: 29 additions & 0 deletions man/word.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

24 changes: 24 additions & 0 deletions tests/testthat/test-functions-word.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
test_that("Word", {

taxa <-
c(
NA,
"Banksia integrifolia",
"Acacia longifolia",
"Commersonia rosea",
"Thelymitra pauciflora",
"Justicia procumbens",
"Hibbertia",
"Rostellularia long leaves",
"Hibbertia sericea var silliafolius",
"Hibbertia sp.",
"x Cynochloris macivorii",
"(Dockrillia pugioniformis x Dockrillia striolata) x Dockrillia pugioniformis"
)

expect_equal(APCalign:::word(taxa, 1), stringr::word(taxa, 1))
expect_equal(APCalign:::word(taxa, 2), stringr::word(taxa, 2))
expect_equal(APCalign:::word(taxa, 3), stringr::word(taxa, 3))
expect_equal(APCalign:::word(taxa, 1,2), stringr::word(taxa, 1,2))
expect_equal(APCalign:::word(taxa, 1,3), stringr::word(taxa, 1,3))
})
2 changes: 1 addition & 1 deletion tests/testthat/test-operation_executes.R
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ test_that("handles NAs inn inputs", {
expect_equal(original_name, out2$original_name)
expect_equal(original_name, out2$aligned_name)
expect_equal(original_name, out2$accepted_name)
expect_equal(original_name[1], stringr::word(out2$suggested_name[1], start = 1, end = 2))
expect_equal(original_name[1], word(out2$suggested_name[1], start = 1, end = 2))

})

Expand Down

0 comments on commit 9be9776

Please sign in to comment.