Skip to content

Commit

Permalink
makes upload and query more robust
Browse files Browse the repository at this point in the history
  • Loading branch information
JBGruber committed Sep 21, 2023
1 parent 4092e01 commit e71f1cc
Show file tree
Hide file tree
Showing 4 changed files with 43 additions and 21 deletions.
26 changes: 16 additions & 10 deletions R/index.R
Original file line number Diff line number Diff line change
Expand Up @@ -78,19 +78,24 @@ modify_index <- function(index, name = index, description = NULL, guest_role = N
#' Upload documents
#'
#' @param index The name of the index to upload the documents to.
#' @param documents A data frame with columns title, text, date, and optional
#' other columns.
#' @param columns An optional list with data types, e.g. list(author = "keyword").
#' @param chunk_size Uploads are broken into chunks to prevent errors. Smaller
#' chunks are less error-prone, but this also makes the upload slower.
#' @param documents A data frame with columns title, text, date, and
#' optional other columns.
#' @param columns An optional list with data types, e.g. list(author =
#' "keyword").
#' @param chunk_size Uploads are broken into chunks to prevent errors.
#' Smaller chunks are less error-prone, but this also makes the
#' upload slower.
#' @param max_tries In case something goes wrong, how often should the
#' function retry to send the documents?
#' @param verbose Should a progress bar be printed during upload?
#' @param credentials The credentials to use. If not given, uses last login
#' information.
#' @param credentials The credentials to use. If not given, uses last
#' login information.
#' @export
upload_documents <- function(index,
documents,
columns = NULL,
chunk_size = 100L,
max_tries = 5L,
verbose = TRUE,
credentials = NULL) {
req_fields <- c("title", "date", "text") # hard coded, might change later
Expand All @@ -103,14 +108,15 @@ upload_documents <- function(index,
# chunk uploads
rows <- seq_len(nrow(documents))
chunks <- split(rows, ceiling(seq_along(rows) / chunk_size))
if (verbose & length(chunks) > 1L) pb <- progress::progress_bar$new(total = length(chunks))
if (verbose & length(chunks) > 1L) cli::cli_progress_bar("Uploading", total = length(chunks))
for (r in chunks) {
if (verbose & length(chunks) > 1L) pb$tick()
if (verbose & length(chunks) > 1L) cli::cli_progress_update()
body <- list(documents = documents[r, ])
if (!is.null(columns)) body$columns <- lapply(columns, jsonlite::unbox)
request(credentials, c("index", index, "documents"), "POST", body, auto_unbox = FALSE) |>
request(credentials, c("index", index, "documents"), "POST", body, max_tries = max_tries, auto_unbox = FALSE) |>
invisible()
}
if (verbose & length(chunks) > 1L) cli::cli_progress_done()
}


Expand Down
13 changes: 9 additions & 4 deletions R/query.R
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,11 @@ query_documents <- function(index,
max_pages_old <- max_pages
max_pages <- 10000 %/% per_page
cli::cli_alert_warning(
c("You requested more than 10 000 results {per_page} * {max_pages} ",
c("You requested more than 10 000 results {per_page} * {max_pages_old} ",
"(per_page * max_pages) = {per_page * max_pages}, which will not ",
"work. If you want more than 10 000 documents, you need to use the ",
"scroll API, e.g., by setting scroll=\"5m\". For now, you will ",
"only ge the first {max_pages} pages.")
"{.emph scroll API}, e.g., by setting {.code scroll=\"5m\"}. For now, ",
"you will only ge the first {max_pages} pages.")
)
}
}
Expand Down Expand Up @@ -140,8 +140,13 @@ query_documents <- function(index,
# requesting a specific page. scroll takes precedence in the API, hence
# when scroll != NULL, page is ignored
if (is.null(scroll)) {
r <<- r
body$page <- body$page + 1
if (body$page >= r$meta$page_count) break
# for when user sets page = NULL
if (length(body$page) == 0) {
body$page <- 1L
}
if (isTRUE(body$page >= r$meta$page_count)) break
} else {
body$scroll_id <- r$meta$scroll_id
}
Expand Down
20 changes: 13 additions & 7 deletions man/upload_documents.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions tests/testthat/test-query.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ test_that("query", {
query_documents("amcat4r-test", queries = NULL, per_page = 1, page = 2, max_pages = 2)
)))

expect_length(
query_documents("amcat4r-test", queries = NULL, per_page = 1, max_pages = 10)$.id,
10L
)

expect_equal(
colnames(
query_aggregate("amcat4r-test",
Expand Down

0 comments on commit e71f1cc

Please sign in to comment.