Merge pull request #131 from myushen/multiple_metadata_from_api
metadata accepts multiple databases
stemangiola authored Feb 28, 2024
2 parents a338712 + 1ab4cd7 commit d645638
Showing 16 changed files with 114 additions and 80 deletions.
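
Note: in practice this change means `get_metadata()` now accepts a vector of parquet URLs (by default the main metadata file plus the new fibrosis file) and exposes them as a single DuckDB-backed table. A minimal usage sketch, assuming the package is installed, dplyr is attached, and the default URLs returned by `get_database_url()` are reachable:

library(CuratedAtlasQueryR)
library(dplyr)

# Default: downloads metadata.0.2.3.parquet and fibrosis.0.2.3.parquet
# into the cache, then exposes them as one lazy table
metadata <- get_metadata()

# Passing the URLs explicitly works the same way
metadata <- get_metadata(
    remote_url = get_database_url(c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")),
    cache_directory = tempdir()
)

metadata |> distinct(cell_type_harmonised) |> head()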
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -121,7 +121,7 @@ biocViews:
     Transcription,
     Transcriptomics
 Encoding: UTF-8
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 LazyDataCompression: xz
 URL: https://github.com/stemangiola/CuratedAtlasQueryR
 BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues
3 changes: 2 additions & 1 deletion NAMESPACE
@@ -1,9 +1,9 @@
 # Generated by roxygen2: do not edit by hand
 
 S3method(as.sparse,DelayedMatrix)
-export(DATABASE_URL)
 export(SAMPLE_DATABASE_URL)
 export(get_SingleCellExperiment)
+export(get_database_url)
 export(get_metadata)
 export(get_seurat)
 export(get_single_cell_experiment)
@@ -43,6 +43,7 @@ importFrom(dplyr,tbl)
 importFrom(dplyr,transmute)
 importFrom(duckdb,duckdb)
 importFrom(glue,glue)
+importFrom(glue,glue_sql)
 importFrom(httr,GET)
 importFrom(httr,HEAD)
 importFrom(httr,modify_url)
2 changes: 1 addition & 1 deletion R/counts.R
@@ -112,7 +112,7 @@ get_single_cell_experiment <- function(
         has_name(raw_data, c("cell_", "file_id_db"))
     )
 
-    versioned_cache_directory <- file.path(cache_directory, COUNTS_VERSION)
+    versioned_cache_directory <- cache_directory
     versioned_cache_directory |> dir.create(
         showWarnings = FALSE,
         recursive = TRUE
4 changes: 2 additions & 2 deletions R/dev.R
@@ -135,7 +135,7 @@ update_unharmonised <- function(unharmonised_parquet_dir, ...){
 #' @keywords internal
 #' @return A character vector of the newly-created anndata files
 #' @examples
-#' \donttest{
+#' \dontrun{
 #' hdf5_to_anndata(
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
 #'     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
@@ -194,7 +194,7 @@ hdf5_to_anndata <- function(input_directory, output_directory){
 # @return A character vector of the newly-created anndata files
 # @noRd
 # @examples
-# \donttest{
+# \dontrun{
 # h5seurat_to_anndata(
 #     "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
 #     "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
80 changes: 44 additions & 36 deletions R/metadata.R
@@ -6,27 +6,28 @@ NULL
 #' Environment that we use to cache the DuckDB connections
 #' @noRd
 cache <- rlang::env(
-    metadata_table = rlang::env()
+    metadata_table = rlang::env()
 )
 
-#' URL pointing to the full metadata file
+#' Returns the URLs for all metadata files
+#' @param databases A character vector specifying the names of the metadata files. Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")
 #' @export
-#' @return A character scalar consisting of the URL
+#' @return A character vector of URLs to parquet files to download
 #' @examples
-#' get_metadata(remote_url = DATABASE_URL)
-DATABASE_URL <- single_line_str(
-    "https://object-store.rc.nectar.org.au/v1/
-    AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet"
-)
+#' get_database_url("metadata.0.2.3.parquet")
+get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")) {
+    glue::glue(
+        "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}")
+}
 
 #' URL pointing to the sample metadata file, which is smaller and for test,
 #' demonstration, and vignette purposes only
 #' @export
 #' @return A character scalar consisting of the URL
 #' @examples
-#' get_metadata(remote_url = SAMPLE_DATABASE_URL)
+#' get_metadata(remote_url = SAMPLE_DATABASE_URL, cache_directory = tempdir())
 SAMPLE_DATABASE_URL <- single_line_str(
-    "https://object-store.rc.nectar.org.au/v1/
+    "https://object-store.rc.nectar.org.au/v1/
     AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/
     sample_metadata.0.2.3.parquet"
 )
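
Note: because `glue::glue()` vectorises over `{databases}`, `get_database_url()` returns one URL per requested file name. A quick sketch of the expected output, with one element printed per line:

get_database_url(c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet"))
#> https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet
#> https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/fibrosis.0.2.3.parquet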
@@ -38,8 +39,8 @@ SAMPLE_DATABASE_URL <- single_line_str(
 #' into [get_single_cell_experiment()] to obtain a
 #' [`SingleCellExperiment::SingleCellExperiment-class`]
 #'
-#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
-#'   to the location of the parquet database.
+#' @param remote_url Optional character vector of any length. One or more HTTP URLs
+#'   pointing to the locations of the parquet database files.
 #' @param cache_directory Optional character vector of length 1. A file path on
 #'   your local system to a directory (not a file) that will be used to store
 #'   `metadata.parquet`
@@ -68,6 +69,7 @@ SAMPLE_DATABASE_URL <- single_line_str(
 #' @importFrom httr progress
 #' @importFrom cli cli_alert_info hash_sha256
 #' @importFrom glue glue
+#' @importFrom purrr walk
 #'
 #' @details
 #'
@@ -142,32 +144,38 @@ SAMPLE_DATABASE_URL <- single_line_str(
 #' get_metadata(cache_directory = path.expand('~'))
 #'
 get_metadata <- function(
-    remote_url = DATABASE_URL,
+    remote_url = get_database_url(),
     cache_directory = get_default_cache_dir(),
     use_cache = TRUE
 ) {
-    hash <- c(remote_url, cache_directory) |> paste0(collapse="") |>
-        hash_sha256()
-    cached_connection <- cache$metadata_table[[hash]]
-    if (!is.null(cached_connection) && isTRUE(use_cache)) {
-        cached_connection
-    }
-    else {
-        db_path <- file.path(cache_directory, "metadata.0.2.3.parquet")
-
-        if (!file.exists(db_path)){
-            report_file_sizes(remote_url)
-            sync_remote_file(
-                remote_url,
-                db_path,
-                progress(type = "down", con = stderr())
-            )
-        }
-
-        table <- duckdb() |>
-            dbConnect(drv = _, read_only = TRUE) |>
-            read_parquet(db_path)
-        cache$metadata_table[[hash]] <- table
-        table
+    # Synchronize remote files
+    walk(remote_url, function(url) {
+        # Calculate the file path from the URL
+        path <- file.path(cache_directory, url |> basename())
+        if (!file.exists(path)) {
+            report_file_sizes(url)
+            sync_remote_file(url,
+                path,
+                progress(type = "down", con = stderr()))
+        }
+    })
+    all_parquet <- file.path(cache_directory, dir(cache_directory, pattern = ".parquet$"))
+    # We try to avoid re-reading a set of parquet files
+    # that is identical to a previous set by hashing the file list
+    hash <- all_parquet |> paste0(collapse="") |>
+        hash_sha256()
+    cached_connection <- cache$metadata_table[[hash]]
+
+    if (!is.null(cached_connection) && isTRUE(use_cache)) {
+        # If the file list is identical, just re-use the database table
+        cached_connection
+    }
+    else {
+        table <- duckdb() |>
+            dbConnect(drv = _, read_only = TRUE) |>
+            read_parquet(path = all_parquet)
+        cache$metadata_table[[hash]] <- table
+        table
     }
 }
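
Note: the table is now built from every `.parquet` file present in `cache_directory`, and the in-memory cache key is the hash of that file list rather than of the remote URL. A small sketch of the intended behaviour, mirroring the existing "get_metadata() is cached" test below; the second call re-uses the DuckDB-backed table as long as the set of cached parquet files is unchanged:

# First call downloads any missing files and builds the lazy table
table_1 <- get_metadata(cache_directory = tempdir())

# Same cache directory, same parquet file list, so the cached connection is re-used
table_2 <- get_metadata(cache_directory = tempdir())
identical(table_1, table_2)
#> [1] TRUE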

3 changes: 1 addition & 2 deletions R/unharmonised.R
@@ -37,7 +37,7 @@ UNHARMONISED_URL <- single_line_str(
 #' @return A named list, where each name is a dataset file ID, and each value is
 #'   a "lazy data frame", ie a `tbl`.
 #' @examples
-#' \donttest{
+#' \dontrun{
 #' dataset <- "838ea006-2369-4e2c-b426-b2a744a2b02b"
 #' harmonised_meta <- get_metadata() |>
 #'     dplyr::filter(file_id == dataset) |> dplyr::collect()
@@ -54,7 +54,6 @@ get_unharmonised_dataset <- function(
 ){
     unharmonised_root <- file.path(
         cache_directory,
-        COUNTS_VERSION,
         "unharmonised"
     )
     file_name <- glue::glue("{dataset_id}.parquet")
6 changes: 4 additions & 2 deletions R/utils.R
@@ -41,7 +41,7 @@ single_line_str <- function(text){
     str_remove_all(text, r"(\n\s*)")
 }
 
-#' Returns the default cache directory
+#' Returns the default cache directory with a version number
 #' @return A length one character vector.
 #' @importFrom tools R_user_dir
 #' @importFrom utils packageName
@@ -51,6 +51,7 @@ get_default_cache_dir <- function() {
     R_user_dir(
         "cache"
     ) |>
+        file.path(COUNTS_VERSION) |>
         normalizePath() |>
         suppressWarnings()
 }
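
Note: with `COUNTS_VERSION` appended, the default cache directory is now version-specific, so caches from different data versions no longer mix. A sketch of the expected location on Linux, assuming `COUNTS_VERSION` is "0.2.1" as in the updated test below (normalizePath() will expand the "~"):

get_default_cache_dir()
#> "~/.cache/R/CuratedAtlasQueryR/0.2.1"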
@@ -89,10 +90,11 @@ sync_remote_file <- function(full_url, output_file, ...) {
 #' @importFrom glue glue
 #' @importFrom dplyr tbl
 #' @importFrom dbplyr sql
+#' @importFrom glue glue_sql
 #' @return An SQL data frame
 #' @keywords internal
 read_parquet <- function(conn, path){
-    from_clause <- glue("FROM read_parquet('{path}')") |> sql()
+    from_clause <- glue_sql("FROM read_parquet([{`path`*}], union_by_name=true)", .con=conn) |> sql()
     tbl(conn, from_clause)
 }
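
Note: the `*` collapse operator in `glue_sql()` splices the whole vector of parquet paths into a single comma-separated list, and `union_by_name=true` tells DuckDB to align columns by name across files whose schemas differ (such as the fibrosis file). A self-contained sketch, using hypothetical cached file paths, that only builds the SQL fragment without touching any files:

library(DBI)
library(duckdb)
library(glue)

con <- dbConnect(duckdb(), read_only = TRUE)
path <- c("~/cache/metadata.0.2.3.parquet", "~/cache/fibrosis.0.2.3.parquet")

# One FROM clause that scans both files and unions them by column name;
# read_parquet() above then wraps it with dplyr::tbl() as a lazy table
glue_sql("FROM read_parquet([{`path`*}], union_by_name=true)", .con = con)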

22 changes: 0 additions & 22 deletions man/DATABASE_URL.Rd

This file was deleted.

2 changes: 1 addition & 1 deletion man/SAMPLE_DATABASE_URL.Rd


22 changes: 22 additions & 0 deletions man/get_database_url.Rd


4 changes: 2 additions & 2 deletions man/get_default_cache_dir.Rd


6 changes: 3 additions & 3 deletions man/get_metadata.Rd


2 changes: 1 addition & 1 deletion man/get_unharmonised_dataset.Rd


2 changes: 1 addition & 1 deletion man/hdf5_to_anndata.Rd


19 changes: 17 additions & 2 deletions tests/testthat/test-query.R
@@ -19,7 +19,7 @@ test_that("get_default_cache_dir() returns the correct directory on Linux", {
     grepl("linux", version$platform, fixed = TRUE) |>
         skip_if_not()
 
-    "~/.cache/R/CuratedAtlasQueryR" |>
+    "~/.cache/R/CuratedAtlasQueryR/0.2.1" |>
         normalizePath() |>
         expect_equal(
             get_default_cache_dir(),
@@ -131,7 +131,7 @@ test_that("get_SingleCellExperiment() assigns the right cell ID to each cell", {
 
     # Load the SCE from cache directly
     assay_1 = CuratedAtlasQueryR:::get_default_cache_dir() |>
-        file.path(CuratedAtlasQueryR:::COUNTS_VERSION, "original", id) |>
+        file.path("original", id) |>
         HDF5Array::loadHDF5SummarizedExperiment() |>
         assay("X") |>
         as.matrix()
@@ -190,3 +190,18 @@ test_that("get_metadata() is cached", {
 
     identical(table, table_2) |> expect_true()
 })
+
+test_that("get_database_url() returns a character vector", {
+    get_database_url() |>
+        expect_s3_class("character")
+})
+
+
+test_that("get_metadata() includes the cell_type `b`, which comes from the fibrosis database", {
+    n_cell <- get_metadata() |> filter(cell_type_harmonised == 'b') |> as_tibble() |> nrow()
+    expect_true(n_cell > 0)
+})




