Skip to content

Commit

Permalink
feat: match_name gains join_id col, allowing for an initial match…
Browse files Browse the repository at this point in the history
…ing override based on some unique ID column (#460)
  • Loading branch information
jdhoffa authored Mar 14, 2024
1 parent faf1216 commit 530f912
Show file tree
Hide file tree
Showing 6 changed files with 320 additions and 16 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# r2dii.match (development version)

* `match_name` gains argument `join_id` allowing an optional perfect join based on a mutual ID column between `loanbook` and `abcd` inputs, prior to attempting fuzzy matching (#135).

# r2dii.match 0.1.4

* `to_alias` can now handle strange encodings without error (#425, @kalashsinghal @Tilmon).
Expand Down
169 changes: 155 additions & 14 deletions R/match_name.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,10 @@
#' only `sector`, the value in the `name` column should be `NA` and
#' vice-versa. This file can be used to manually match loanbook companies to
#' abcd.
#' @param join_id A join specification passed to [dplyr::inner_join()]. If a
#' character string, it assumes identical join columns between `loanbook` and
#' `abcd`. If a named character vector, it uses the name as the join column of `loanbook` and
#' the value as the join column of `abcd`.
#' @param ... Arguments passed on to [stringdist::stringsim()].
#' @param ald `r lifecycle::badge('superseded')` `ald` has been superseded by
#' `abcd`.
Expand Down Expand Up @@ -83,6 +87,25 @@
#' code_system = "XYZ"
#' )
#'
#' # match on LEI
#' loanbook <- tibble(
#' sector_classification_system = "XYZ",
#' sector_classification_direct_loantaker = "D35.11",
#' id_ultimate_parent = "UP15",
#' name_ultimate_parent = "Won't fuzzy match",
#' id_direct_loantaker = "C294",
#' name_direct_loantaker = "Won't fuzzy match",
#' lei_direct_loantaker = "LEI123"
#' )
#'
#' abcd <- tibble(
#' name_company = "alpine knits india pvt. limited",
#' sector = "power",
#' lei = "LEI123"
#' )
#'
#' match_name(loanbook, abcd, join_by = c(lei_direct_loantaker = "lei"))
#'
#' restore <- options(r2dii.match.sector_classifications = your_classifications)
#'
#' loanbook <- tibble(
Expand Down Expand Up @@ -111,6 +134,7 @@ match_name <- function(loanbook,
method = "jw",
p = 0.1,
overwrite = NULL,
join_id = NULL,
ald = deprecated(),
...) {
restore <- options(datatable.allow.cartesian = TRUE)
Expand All @@ -125,16 +149,77 @@ match_name <- function(loanbook,
abcd <- ald
}

match_name_impl(
loanbook = loanbook,
abcd = abcd,
by_sector = by_sector,
min_score = min_score,
method = method,
p = p,
overwrite = overwrite,
...
)
if (!is.null(join_id)) {
check_join_id(join_id, loanbook, abcd)

crucial_names <- c("name_company", "sector", join_id)
check_crucial_names(abcd, crucial_names)

prep_abcd <- dplyr::transmute(
abcd,
name_abcd = .data[["name_company"]],
sector_abcd = tolower(.data[["sector"]]),
!!join_id := .data[[join_id]]
)

prep_abcd <- dplyr::distinct(prep_abcd)

prep_lbk <- may_add_sector_and_borderline(loanbook)
prep_lbk <- distinct(prep_lbk)

join_matched <- dplyr::inner_join(
prep_lbk,
prep_abcd,
by = join_id,
na_matches = "never"
)

join_by_list <- as_join_by(join_id)
loanbook_join_id <- join_by_list[[1]]

join_matched <- dplyr::mutate(
join_matched,
score = 1,
source = "id joined",
level = loanbook_join_id,
name = .data[["name_abcd"]]
)

loanbook <- dplyr::filter(
loanbook,
!.data[[loanbook_join_id]] %in% join_matched[[loanbook_join_id]]
)
}

if (nrow(loanbook) != 0) {
fuzzy_matched <- match_name_impl(
loanbook = loanbook,
abcd = abcd,
by_sector = by_sector,
min_score = min_score,
method = method,
p = p,
overwrite = overwrite,
...
)
} else {
fuzzy_matched <- tibble()
}

if (exists("join_matched")) {
out <- dplyr::bind_rows(join_matched, fuzzy_matched)
} else if (nrow(fuzzy_matched) == 0 && exists("join_matched")) {
out <- join_matched
} else {
out <- fuzzy_matched
}

if (identical(nrow(out), 0L)) {
rlang::warn("Found no match.")
return(empty_loanbook_tibble(loanbook, dplyr::groups(loanbook)))
}

out
}

match_name_impl <- function(loanbook,
Expand All @@ -145,6 +230,7 @@ match_name_impl <- function(loanbook,
p = 0.1,
overwrite = NULL,
...) {

old_groups <- dplyr::groups(loanbook)
loanbook <- ungroup(loanbook)

Expand All @@ -164,8 +250,8 @@ match_name_impl <- function(loanbook,
setDT(a)

if (identical(nrow(a), 0L)) {
rlang::warn("Found no match.")
return(empty_loanbook_tibble(loanbook, old_groups))
rlang::inform("Found no match via fuzzy matching.")
return(a)
}

a <- unique(a)[
Expand All @@ -179,8 +265,8 @@ match_name_impl <- function(loanbook,
a <- a[score >= min_score, ]

if (identical(nrow(a), 0L)) {
rlang::warn("Found no match.")
return(empty_loanbook_tibble(loanbook, old_groups))
rlang::inform("Found no match via fuzzy matching.")
return(a)
}

l <- rename(prep_lbk, alias_lbk = "alias")
Expand Down Expand Up @@ -349,3 +435,58 @@ names_added_by_match_name <- function() {
"borderline"
)
}

check_join_id <- function(join_id, loanbook, abcd) {

join_id_list <- as_join_by(join_id)

if (!rlang::has_name(loanbook, join_id_list[[1]])) {
rlang::abort(
"join_id_not_in_loanbook",
message = glue(
"The join_id `{join_id_list[[1]]}` must be present in `loanbook` input."
)
)
} else if (!rlang::has_name(abcd, join_id_list[[2]])) {
rlang::abort(
"join_id_not_in_abcd",
message = glue(
"The join_id `{join_id_list[[2]]}` must be present in `abcd` input."
)
)
}

invisible(join_id)
}

as_join_by <- function(x) {

if (rlang::is_list(x)) {
if (length(x) != 1L) {
rlang::abort("`join_id` must be a vector of length 1.")
}
x_name <- names(x) %||% x
y_name <- unname(x)
} else if (rlang::is_character(x)) {
x_name <- names(x) %||% x
y_name <- unname(x)

# If x partially named, assume unnamed are the same in both tables
x_name[x_name == ""] <- y_name[x_name == ""]
} else {
rlang::abort("`by` must be a string or a character vector.")
}

if (!rlang::is_character(x_name)) {
rlang::abort("`by$x` must evaluate to a character vector.")
}
if (!rlang::is_character(y_name)) {
rlang::abort("`by$y` must evaluate to a character vector.")
}

c(x_name, y_name)
}

`%||%` <- function(x, y) {
if (is.null(x)) y else x
}
9 changes: 7 additions & 2 deletions R/restructure_abcd.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#' from values in the `name_company` column.
#'
#' @param data A data frame. Should be an asset-level dataset.
#' @param join_id (Optional) A string giving the name of an `ID` column to
#' preserve in the restructuring
#'
#' @seealso [r2dii.data::abcd_demo] `to_alias()`.
#'
Expand All @@ -15,12 +17,15 @@
#' restructure_abcd(r2dii.data::abcd_demo)
#' @noRd
restructure_abcd <- function(data) {
check_crucial_names(data, c("name_company", "sector"))
crucial_names <- c("name_company", "sector")
check_crucial_names(data, crucial_names)

out <- dplyr::transmute(
data,
name = .data$name_company, sector = tolower(.data$sector)
name = .data$name_company,
sector = tolower(.data$sector)
)

out <- distinct(out)
add_alias(out)
}
Expand Down
25 changes: 25 additions & 0 deletions man/match_name.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 530f912

Please sign in to comment.