Skip to content

Commit

Permalink
import DBI and duckdb,
Browse files Browse the repository at this point in the history
  • Loading branch information
rafapereirabr committed Sep 18, 2024
1 parent 8254c17 commit 4a968de
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 118 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,15 @@ Imports:
arrow (>= 15.0.1),
checkmate,
curl (>= 5.0.0),
DBI,
dplyr,
duckdb,
duckplyr,
fs,
tools
Suggests:
covr,
DBI,
dbplyr,
duckdb,
geobr,
ggplot2 (>= 3.3.1),
rmarkdown,
Expand Down
128 changes: 19 additions & 109 deletions R/merge_household.R
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,22 @@ merge_household_var <- function(df,
df_household <- dplyr::filter(df_household, get(key_key) %in% key_values) |>
dplyr::compute()

#### https://github.com/duckdb/duckdb-r/issues/72

# convert to duckdb
# df <- arrow::to_duckdb(df)
# df_household <- arrow::to_duckdb(df_household)
df <- arrow::to_duckdb(df)
df_household <- arrow::to_duckdb(df_household)

# register db connection
con <- DBI::dbConnect(
duckdb::duckdb(), ":memory:", read_only = FALSE,
config=list("temp_directory" = fs::path_temp())
)

con <- duckdb::dbConnect(duckdb::duckdb(), read_only = FALSE)
duckdb::duckdb_register_arrow(con, 'df', df)
duckdb::duckdb_register_arrow(con, 'df_household', df_household)
# # config db connection
# pragmas <- paste0(
# "PRAGMA memory_limit='32GB'; PRAGMA temp_directory='", tempdir(), "';")
# dbExecute(conn = con, pragmas)

# limit RAM and threads of duckdb ???
# DBI::dbExecute(con, "PRAGMA threads=1; PRAGMA memory_limit='1GB';")
Expand All @@ -89,6 +98,10 @@ merge_household_var <- function(df,
# https://github.com/duckdb/duckdb-r/issues/83
# https://github.com/duckdb/duckdb-r/issues/72

# register data to db
duckdb::duckdb_register(con, 'df', df)
duckdb::duckdb_register(con, 'df_household', df_household)

# merge
df_geo <- duckplyr::left_join(dplyr::tbl(con, "df"),
dplyr::tbl(con, "df_household"),
Expand All @@ -102,112 +115,9 @@ merge_household_var <- function(df,
# remove duckdb instance
duckdb::duckdb_unregister_arrow(con, 'df')
duckdb::duckdb_unregister_arrow(con, 'df_household')
duckdb::dbDisconnect(con, shutdown = TRUE)
DBI::dbDisconnect(con, shutdown = TRUE)
rm(con)
gc()

return(df_geo)
}
# merge_household_var <- function(df,
# year = parent.frame()$year,
# add_labels = parent.frame()$add_labels,
# showProgress = parent.frame()$showProgress){
#
# # download household data
# df_household <- censobr::read_households(year = year,
# add_labels = add_labels,
# as_data_frame = FALSE,
# showProgress = showProgress)
#
# # set vars to merge
# if (year == 1970) {
# key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
# 'code_region', 'name_region', 'id_household')
# key_key <- 'id_household'
# }
#
# if (year == 1980) {
# key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
# 'code_region', 'name_region', 'V6', 'V601')
# key_key <- 'V601'
#
# # rename weight var
# df_household <- dplyr::rename(df_household, 'V603_household' = 'V603')
# }
#
# if (year == 1991) {
# key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
# 'code_region', 'name_region', 'V0109')
#
# key_key <- 'V0109'
# # rename weight var
# df_household <- dplyr::rename(df_household, 'V7300_household' = 'V7300')
# }
#
# if (year == 2000) {
# key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
# 'code_region', 'name_region', 'code_weighting', 'V0300')
# key_key <- 'V0300'
# }
#
# if (year == 2010) {
# key_vars <- c('code_muni', 'code_state', 'abbrev_state','name_state',
# 'code_region', 'name_region', 'code_weighting', 'V0300')
#
# key_key <- 'V0300'
# # rename weight var
# df_household <- dplyr::rename(df_household, 'V0010_household' = 'V0010') |>
# dplyr::compute()
# }
#
#
# # drop repeated vars
# all_common_vars <- names(df)[names(df) %in% names(df_household)]
# vars_to_drop <- setdiff(all_common_vars, key_vars)
# df_household <- dplyr::select(df_household, -all_of(vars_to_drop)) |>
# dplyr::compute()
#
# # # pre-filter right-hand table that matches key in left-hand table
# # this improves performance a bit
# df <- dplyr::compute(df)
# key_values <- as.vector(unique(df$GetColumnByName(key_key)))
# df_household <- dplyr::filter(df_household, get(key_key) %in% key_values) |>
# dplyr::compute()
#
#
# # con <- DBI::dbConnect(duckdb::duckdb(), read_only = FALSE)
# con <- duckdb::dbConnect(duckdb::duckdb(), read_only = FALSE, dbdir = "duckdb")
#
# duckdb::duckdb_register_arrow(con, 'df', df)
# duckdb::duckdb_register_arrow(con, 'df_household', df_household)
#
# # limit RAM and threads of duckdb ???
# # dbExecute(con, "PRAGMA threads=1; PRAGMA memory_limit='1GB';")
# # dbExecute(conn = conn, paste0("PRAGMA memory_limit='12GB'"))
# # appears to work.
# # https://github.com/duckdb/duckdb-r/issues/83
# # https://github.com/duckdb/duckdb-r/issues/72
#
# df_db <- dplyr::tbl(con, "df")
# df_household_db <- dplyr::tbl(con, "df_household")
#
# # merge
# df_geo <- dplyr::left_join(
# x = df_db,
# y = df_household_db,
# by = key_vars
# )
# df_geo <- dplyr::compute(df_geo)
#
# # back to arrow
# df_geo <- arrow::to_arrow(df_geo)
#
# # remove duckdb instance
# duckdb::duckdb_unregister_arrow(con, 'df')
# duckdb::duckdb_unregister_arrow(con, 'df_household')
# duckdb::dbDisconnect(con, shutdown = TRUE)
#
# on.exit(duckdb::dbDisconnect(con, shutdown = TRUE))
#
# return(df_geo)
# }
4 changes: 2 additions & 2 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ download_file <- function(file_url = parent.frame()$file_url,
cache_message(local_file, cache)

# this is necessary to silence download message when reading local file
if(file.exists(local_file) & isTRUE(cache)){
showProgress <- FALSE
if (file.exists(local_file) & isTRUE(cache)) {
return(local_file)
}

# download files
Expand Down
2 changes: 1 addition & 1 deletion vignettes/censobr.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ The package currently includes 6 main functions to download census data:
<th rowspan="2">Origem</th>
<th rowspan="2">Unidade</th>
<th rowspan="2">Definição</th>
<th colspan="6">Disponibilidade</th>
<th colspan="7">Disponibilidade</th>
</tr>
<tr>
<th>1960</th>
Expand Down
9 changes: 5 additions & 4 deletions vignettes/larger_than_memory.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ Because of the seamless integration between `{arrow}` and `{dplyr}`, Arrow tabl
Without calling one of these, the query is just prepared but not executed, which is useful for delaying heavy computations until needed.
In the example below, we create a new Arrow table that only includes the death records of men in the state of Rio de Janeiro without loading the data to memory. Note that the only piece of data we `collect()` (i.e. load to memory) here is the first 10 observations of the data.
In the example below, we create a new Arrow table that only includes the death records of men in the state of Rio de Janeiro without loading the data to memory. Note that the only piece of data we `collect()` (i.e. load to memory) here is the first few observations of the data.
```{r warning = FALSE, message=FALSE}
library(dplyr)
Expand All @@ -54,7 +54,7 @@ library(dplyr)
rio <- df |>
filter(V0704 == 'Masculino' & abbrev_state == 'RJ')
head(rio, n = 10) |>
head(rio) |>
collect()
```
Expand Down Expand Up @@ -108,11 +108,12 @@ head(rio2)
A third alternative is the new `{duckplyr}` package. This library is under development, so it still does not cover many of the `{dplyr}` verbs (see [here](https://duckdb.org/2024/04/02/duckplyr.html)). Nonetheless, it looks like a really promising and seamless approach that allows users to leverage the powerful capabilities of `{duckdb}`.
```{r warning = FALSE, eval = FALSE}
```{r warning = FALSE, message = FALSE}
library(duckplyr)
rio3 <- df |>
duckplyr::filter(V0704 == 'Masculino' & abbrev_state == 'RJ')
head(rio3) |> collect()
head(rio3) |>
collect()
```

0 comments on commit 4a968de

Please sign in to comment.