From 81e906bea9d7512b6ca046167f727b1d779f6f7e Mon Sep 17 00:00:00 2001 From: Alexandros Kouretsis Date: Thu, 19 Sep 2024 22:07:23 +0000 Subject: [PATCH 01/18] HPC vs reproducibility post (#195) --- .../zzz_DO_NOT_EDIT_the__tensio.../appendix.R | 73 +++++++++ posts/zzz_DO_NOT_EDIT_the__tensio.../log.txt | 3 + ...__reproducibility_vs.__parallelization.qmd | 155 ++++++++++++++++++ 3 files changed, 231 insertions(+) create mode 100644 posts/zzz_DO_NOT_EDIT_the__tensio.../appendix.R create mode 100644 posts/zzz_DO_NOT_EDIT_the__tensio.../log.txt create mode 100644 posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../appendix.R b/posts/zzz_DO_NOT_EDIT_the__tensio.../appendix.R new file mode 100644 index 00000000..c69926b0 --- /dev/null +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../appendix.R @@ -0,0 +1,73 @@ +suppressMessages(library(dplyr)) +# markdown helpers -------------------------------------------------------- + +markdown_appendix <- function(name, content) { + paste(paste("##", name, "{.appendix}"), " ", content, sep = "\n") +} +markdown_link <- function(text, path) { + paste0("[", text, "](", path, ")") +} + + + +# worker functions -------------------------------------------------------- + +insert_source <- function(repo_spec, name, + collection = "posts", + branch = "main", + host = "https://github.com", + text = "Source", + file_name) { + path <- paste( + host, + repo_spec, + "tree", + branch, + collection, + name, + file_name, + sep = "/" + ) + return(markdown_link(text, path)) +} + +insert_timestamp <- function(tzone = Sys.timezone()) { + time <- lubridate::now(tzone = tzone) + stamp <- as.character(time, tz = tzone, usetz = TRUE) + return(stamp) +} + +insert_lockfile <- function(repo_spec, name, + collection = "posts", + branch = "main", + host = "https://github.com", + text = "Session info") { + path <- path <- 
"https://pharmaverse.github.io/blog/session_info.html" + + return(markdown_link(text, path)) +} + + + +# top level function ------------------------------------------------------ + +insert_appendix <- function(repo_spec, name, collection = "posts", file_name) { + appendices <- paste( + markdown_appendix( + name = "Last updated", + content = insert_timestamp() + ), + " ", + markdown_appendix( + name = "Details", + content = paste( + insert_source(repo_spec, name, collection, file_name = file_name), + # get renv information, + insert_lockfile(repo_spec, name, collection), + sep = ", " + ) + ), + sep = "\n" + ) + knitr::asis_output(appendices) +} diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../log.txt b/posts/zzz_DO_NOT_EDIT_the__tensio.../log.txt new file mode 100644 index 00000000..65f1f6da --- /dev/null +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../log.txt @@ -0,0 +1,3 @@ +mutate: new variable 'b' (character) with one unique value and 0% NA +mutate: new variable 'a' (character) with one unique value and 0% NA +mutate: new variable 'c' (character) with one unique value and 0% NA diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd new file mode 100644 index 00000000..f8332ec9 --- /dev/null +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -0,0 +1,155 @@ +--- +title: "The Tension of High-Performance Computing: Reproducibility vs. Parallelization" +author: + - name: Alexandros Kouretsis +description: "" +# Note that the date below will be auto-updated when the post is merged. +date: "2024-12-01" +# Please do not use any non-default categories. 
+# You can find the default categories in the repository README.md +categories: [Submissions, Technical] +# Feel free to change the image +image: "pharmaverse.png" + +--- + + + +```{r setup, include=FALSE} +long_slug <- "zzz_DO_NOT_EDIT_the__tensio..." +# renv::use(lockfile = "renv.lock") +``` + + + +## Harnessing HPC for Drug Development + +In the world of pharmaceutical research, high-performance computing (HPC) plays a pivotal role in driving advancements in drug discovery and development. From analyzing vast genomic datasets to simulating drug interactions across diverse populations, HPC enables researchers to tackle complex computational tasks at +high speeds. As pharmaceutical research becomes increasingly data-driven, the need for powerful computational tools has grown, allowing for more accurate predictions, faster testing, and more efficient processes. However, with the growing complexity and scale of these simulations, ensuring reproducibility of results becomes a significant challenge. + +In this blog post, we will explore common reproducibility challenges in drug development and simulations, focusing on how the `{mirai}` package can be used as a backend solution to effectively manage parallelization. + +## The Problem: Reproducibility in Parallel Simulations + +Imagine a research team at the forefront of developing a new drug. They use sophisticated simulations to predict how the drug will perform across different patient cohorts. To manage the large computational workload, the team employs parallel processing, distributing the simulation tasks across multiple processors. This approach significantly speeds up the process, allowing them to handle vast datasets efficiently. + +However, the team soon encounters a problem. Each time they rerun the simulations, the results differ slightly, even though they use the same input parameters. 
This inconsistency raises a red flag: *their results are not reproducible.* In the pharmaceutical industry, where accuracy and reliability are paramount, this is a serious issue. Reproducibility is not just a scientific ideal; it's a regulatory requirement. + +Upon investigation, the team discovers that the variability in their results is due to the way tasks are parallelized across processors. The order in which operations are executed can differ slightly between runs, leading to small but significant variations in the outcomes. These differences are particularly problematic when they accumulate over thousands of iterations, making it difficult to ensure that the simulation results can be consistently reproduced by others. + +### Tracking Operations in Parallel Computing + +Let’s explore a simple scenario where parallelization creates confusion in tracking operations due to the asynchronous nature of task execution and logging. + +```{r, message=FALSE, eval=FALSE} +library("mirai") +library("dplyr", warn.conflicts = FALSE) + +# start parallel workers +daemons(4) + +# load libraries on each worker and set up logging to a file +everywhere({ + library("dplyr") + library("tidyr") + library("mirai") + library("tidylog", warn.conflicts = FALSE) + + # Define function to log messages to the log file + log_to_file <- \(txt) cat(txt, file = log_file, sep = "\n", append = TRUE) + options("tidylog.display" = list(message, log_to_file)) +}, log_file = "log.txt") + +m <- mirai_map(letters[1:3], \(x) { + mutate(tibble(.rows = 1), "{x}" := "foo") +}) + +result <- m[] |> dplyr::bind_cols() + +daemons(0) + +return(result) +``` + +In the above code chunk, we set up a parallel processing environment using the `{mirai}` package. The function `mirai_map()` is used to apply a mutating function in parallel to a tibble for each element of `letters[1:3]`, logging the operations to a file using the `{tidylog}` package. 
However, while we can log each operation as it happens, due to the parallel nature of `{mirai}`, the logging does not occur in a controlled or sequential order. +*Each daemon executes its task independently, and the order of logging in the file will depend on the completion times of these parallel processes rather than the intended flow of operations.* + +> Parallel computations can obscure the traceability of operations + +This lack of control can lead to a situation where the log entries do not reflect the actual sequence in which the `{dplyr}` commands were expected to be processed. Although the operations themselves are carried out correctly, the asynchronous logging may create challenges in *tracing* and *debugging* the process, as entries in the log file could appear out of order, giving an incomplete or misleading representation of the task flow. + +```{r} +readLines("log.txt") +``` + +In the above code, when we read the contents of the log file, you will notice that the logs are not in the same order as the commands were dispatched. This demonstrates the inherent difficulty in managing the order of logging in parallel tasks, especially when there is no guarantee on how quickly each process will complete and record its operations. + +### Task Dispatching and RNG Management + +By default, `{mirai}` uses an advanced dispatcher to manage task distribution efficiently, scheduling tasks in a First-In-First-Out manner and leveraging `{nanonext}` primitives for zero-latency, resource-free task management. However, its asynchronous execution can hinder reproducibility, especially with random number generation (RNG) or tasks needing strict order. + +To enhance reproducibility, `{mirai}` allows disabling the dispatcher, directly connecting the host to daemons in a round-robin fashion. While less efficient, this approach gives more control over task execution and is better suited for ensuring consistent RNG and reproducible results. 
+ +```{r} +library(mirai) +library(dplyr, warn.conflicts = FALSE) + +# Parameters for the simulation +cohorts <- tribble( + ~patient_count, ~mean_effect, ~sd_effect, + 1000, 0.7, 0.1, + 1000, 0.65, 0.15, + 1000, 0.75, 0.05 +) + +# Start daemons with consistent RNG streams +x <- daemons(4, dispatcher = FALSE, seed = 123) + +# Parallel simulation for each row of the cohorts table +m <- mirai_map(cohorts, \(patient_count, mean_effect, sd_effect) { + dplyr::tibble( + patient_id = 1:patient_count, + efficacy = rnorm(patient_count, mean = mean_effect, sd = sd_effect) + ) +}) + +results <- m[] |> dplyr::bind_rows() + +x <- daemons(0, dispatcher = FALSE) + +results %>% + group_by(patient_id) %>% + summarise( + mean_efficacy = mean(efficacy), + sd_efficacy = sd(efficacy) + ) +``` + +In this example, we use `tribble` to define the simulation parameters and initialize 4 daemons with dispatcher = FALSE and a fixed seed to ensure consistent random number generation across tasks. The `mirai_map()` function parallelizes the drug efficacy simulation, and the results are combined using `bind_rows()` for further analysis. Disabling the dispatcher gives more control over task execution, ensuring reproducibility. If you repeat the +computation you will notice that it generates consistent results. + +However, this approach comes at a cost. Disabling the dispatcher may lead to inefficient resource utilization when tasks are unevenly distributed, as some daemons may remain idle. While reproducibility is prioritized, we sacrifice some performance, especially when handling tasks with varying workloads. + +> "In prioritizing reproducibility, we inevitably sacrifice some performance, especially when tasks with unequal workloads are distributed across daemons." + +Reproducibility becomes more complex when using parallelization frameworks like `{parallelMap}`, `{doFuture}`, and `{future}`, as each handles random number generation (RNG) differently. 
While `set.seed()` is sufficient for sequential tasks, parallel computations require managing RNG streams carefully, often using types like "L'Ecuyer-CMRG" or functions such as `clusterSetRNGStream()` for synchronization. Each framework requires specific adjustments to ensure consistent results, emphasizing the importance of understanding how each backend manages RNG in parallel environments. + +## Closing Thoughts + +While we've explored the basics of reproducibility in parallel computing with simple examples, the challenges extend beyond random number generation. Issues such as process synchronization, using tools like lock files, become critical in multi-process environments. Floating-point arithmetic adds complexity, particularly when distributed across heterogeneous systems with varying architectures and precision. Managing dependencies also becomes more intricate as tasks grow in complexity, and ensuring error recovery in a controlled manner is vital to avoid crashes or inconsistent results in large-scale operations. + +Powerful tools like `{targets}` and `{crew}` can help tackle these advanced challenges. `{targets}` is a workflow orchestration tool that manages dependencies, automates reproducible pipelines, and ensures consistent results across runs. Meanwhile, `{crew}` extends this by efficiently managing distributed computing tasks, allowing for seamless scaling, load balancing, and error handling across local processes or cloud environments. Together, these tools simplify the execution of complex high-performance computing (HPC) workflows, providing flexibility and robustness for scaling computations while trying for maintaining control and reproducibility. + +This blog post has hopefully increased your intuition about the challenges that may arise when incorporating HPC into your work. 
By understanding these complexities, you’ll be better positioned to make informed decisions about the trade-offs—such as balancing performance and reproducibility — that are most relevant to your specific case. As your computations scale, finding the right balance between efficiency, accuracy, and reproducibility will be crucial for the success of your projects. + + + +```{r, echo=FALSE} +source("appendix.R") +insert_appendix( + repo_spec = "pharmaverse/blog", + name = long_slug, + # file_name should be the name of your file + file_name = list.files() %>% stringr::str_subset(".qmd") %>% first() +) +``` From 7efff1207e3a99299f4100bdd60f0dfefd73df8b Mon Sep 17 00:00:00 2001 From: alekoure Date: Mon, 23 Sep 2024 15:52:26 +0000 Subject: [PATCH 02/18] Improve intro text and text wrap --- ...__reproducibility_vs.__parallelization.qmd | 152 ++++++++++++++---- 1 file changed, 120 insertions(+), 32 deletions(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index f8332ec9..60020225 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -24,22 +24,48 @@ long_slug <- "zzz_DO_NOT_EDIT_the__tensio..." ## Harnessing HPC for Drug Development -In the world of pharmaceutical research, high-performance computing (HPC) plays a pivotal role in driving advancements in drug discovery and development. From analyzing vast genomic datasets to simulating drug interactions across diverse populations, HPC enables researchers to tackle complex computational tasks at -high speeds. 
As pharmaceutical research becomes increasingly data-driven, the need for powerful computational tools has grown, allowing for more accurate predictions, faster testing, and more efficient processes. However, with the growing complexity and scale of these simulations, ensuring reproducibility of results becomes a significant challenge. - -In this blog post, we will explore common reproducibility challenges in drug development and simulations, focusing on how the `{mirai}` package can be used as a backend solution to effectively manage parallelization. - -## The Problem: Reproducibility in Parallel Simulations - -Imagine a research team at the forefront of developing a new drug. They use sophisticated simulations to predict how the drug will perform across different patient cohorts. To manage the large computational workload, the team employs parallel processing, distributing the simulation tasks across multiple processors. This approach significantly speeds up the process, allowing them to handle vast datasets efficiently. - -However, the team soon encounters a problem. Each time they rerun the simulations, the results differ slightly, even though they use the same input parameters. This inconsistency raises a red flag: *their results are not reproducible.* In the pharmaceutical industry, where accuracy and reliability are paramount, this is a serious issue. Reproducibility is not just a scientific ideal; it's a regulatory requirement. - -Upon investigation, the team discovers that the variability in their results is due to the way tasks are parallelized across processors. The order in which operations are executed can differ slightly between runs, leading to small but significant variations in the outcomes. These differences are particularly problematic when they accumulate over thousands of iterations, making it difficult to ensure that the simulation results can be consistently reproduced by others. 
+In pharmaceutical research, high-performance computing (HPC) plays a pivotal +role in driving advancements in drug discovery and development. From analyzing +vast genomic datasets to simulating drug interactions across diverse +populations, HPC enables researchers to tackle complex computational tasks at +high speeds. As pharmaceutical research becomes increasingly data-driven, the +need for powerful computational tools has grown, allowing for more accurate +predictions, faster testing, and more efficient processes. However, with the +growing complexity and scale of these computations, ensuring reproducibility +of results becomes a significant challenge. + +In this blog post, we will explore common reproducibility challenges in drug +development and simulations, using the `{mirai}` package as a backend solution +to manage parallelization. + +## The Problem: Reproducibility in Parallel Processing + +Imagine a research team working on a cutting-edge drug development project. To +process and analyze vast amounts of data efficiently, they leverage parallel +processing, distributing tasks across multiple processors. This approach +significantly accelerates their work, enabling them to handle large datasets and +complex computations in a fraction of the time. + +However, the team soon encounters an issue. Each time they rerun the same +processing tasks with identical input parameters, the results differ slightly. +This raises a major concern: *the results are not reproducible.* In industries +like pharmaceuticals, where accuracy and consistency are critical, +reproducibility is not just important—it's a regulatory requirement. + +For example, in large-scale Monte Carlo simulations, small differences can arise +not only from changes in execution order across processors but also from +inconsistencies between workers or difficulties in maintaining synchronized +random number generation (RNG) streams. 
Furthermore, the more complex the +environment—with multiple components such as distributed workers, different +hardware, or varying system configurations—the harder it becomes to reprovision +the exact same environment and repeat the computations exactly. As these +variations accumulate, ensuring consistent and reproducible results becomes +a significant challenge in data-driven research. ### Tracking Operations in Parallel Computing -Let’s explore a simple scenario where parallelization creates confusion in tracking operations due to the asynchronous nature of task execution and logging. +Let’s explore a simple scenario where parallelization creates confusion in +tracking operations due to the asynchronous nature of task execution and logging. ```{r, message=FALSE, eval=FALSE} library("mirai") @@ -71,24 +97,47 @@ daemons(0) return(result) ``` -In the above code chunk, we set up a parallel processing environment using the `{mirai}` package. The function `mirai_map()` is used to apply a mutating function in parallel to a tibble for each element of `letters[1:3]`, logging the operations to a file using the `{tidylog}` package. However, while we can log each operation as it happens, due to the parallel nature of `{mirai}`, the logging does not occur in a controlled or sequential order. -*Each daemon executes its task independently, and the order of logging in the file will depend on the completion times of these parallel processes rather than the intended flow of operations.* +In the above code chunk, we set up a parallel processing environment using the +`{mirai}` package. The function `mirai_map()` is used to apply a mutating +function in parallel to a tibble for each element of `letters[1:3]`, logging the +operations to a file using the `{tidylog}` package. However, while we can log +each operation as it happens, due to the parallel nature of `{mirai}`, the +logging does not occur in a controlled or sequential order. 
*Each daemon +executes its task independently, and the order of logging in the file will +depend on the completion times of these parallel processes rather than the +intended flow of operations.* > Parallel computations can obscure the traceability of operations -This lack of control can lead to a situation where the log entries do not reflect the actual sequence in which the `{dplyr}` commands were expected to be processed. Although the operations themselves are carried out correctly, the asynchronous logging may create challenges in *tracing* and *debugging* the process, as entries in the log file could appear out of order, giving an incomplete or misleading representation of the task flow. +This lack of control can lead to a situation where the log entries do not +reflect the actual sequence in which the `{dplyr}` commands were expected to be +processed. Although the operations themselves are carried out correctly, the +asynchronous logging may create challenges in *tracing* and *debugging* the +process, as entries in the log file could appear out of order, giving an +incomplete or misleading representation of the task flow. ```{r} readLines("log.txt") ``` -In the above code, when we read the contents of the log file, you will notice that the logs are not in the same order as the commands were dispatched. This demonstrates the inherent difficulty in managing the order of logging in parallel tasks, especially when there is no guarantee on how quickly each process will complete and record its operations. +In the above code, when we read the contents of the log file, you will notice +that the logs are not in the same order as the commands were dispatched. This +demonstrates the inherent difficulty in managing the order of logging in +parallel tasks, especially when there is no guarantee on how quickly each +process will complete and record its operations. 
### Task Dispatching and RNG Management -By default, `{mirai}` uses an advanced dispatcher to manage task distribution efficiently, scheduling tasks in a First-In-First-Out manner and leveraging `{nanonext}` primitives for zero-latency, resource-free task management. However, its asynchronous execution can hinder reproducibility, especially with random number generation (RNG) or tasks needing strict order. +By default, `{mirai}` uses an advanced dispatcher to manage task distribution +efficiently, scheduling tasks in a First-In-First-Out manner and leveraging +`{nanonext}` primitives for zero-latency, resource-free task management. +However, its asynchronous execution can hinder reproducibility, especially with +random number generation (RNG) or tasks needing strict order. -To enhance reproducibility, `{mirai}` allows disabling the dispatcher, directly connecting the host to daemons in a round-robin fashion. While less efficient, this approach gives more control over task execution and is better suited for ensuring consistent RNG and reproducible results. +To enhance reproducibility, `{mirai}` allows disabling the dispatcher, directly +connecting the host to daemons in a round-robin fashion. While less efficient, +this approach gives more control over task execution and is better suited for +ensuring consistent RNG and reproducible results. ```{r} library(mirai) @@ -125,22 +174,61 @@ results %>% ) ``` -In this example, we use `tribble` to define the simulation parameters and initialize 4 daemons with dispatcher = FALSE and a fixed seed to ensure consistent random number generation across tasks. The `mirai_map()` function parallelizes the drug efficacy simulation, and the results are combined using `bind_rows()` for further analysis. Disabling the dispatcher gives more control over task execution, ensuring reproducibility. If you repeat the -computation you will notice that it generates consistent results. - -However, this approach comes at a cost. 
Disabling the dispatcher may lead to inefficient resource utilization when tasks are unevenly distributed, as some daemons may remain idle. While reproducibility is prioritized, we sacrifice some performance, especially when handling tasks with varying workloads. - -> "In prioritizing reproducibility, we inevitably sacrifice some performance, especially when tasks with unequal workloads are distributed across daemons." - -Reproducibility becomes more complex when using parallelization frameworks like `{parallelMap}`, `{doFuture}`, and `{future}`, as each handles random number generation (RNG) differently. While `set.seed()` is sufficient for sequential tasks, parallel computations require managing RNG streams carefully, often using types like "L'Ecuyer-CMRG" or functions such as `clusterSetRNGStream()` for synchronization. Each framework requires specific adjustments to ensure consistent results, emphasizing the importance of understanding how each backend manages RNG in parallel environments. +In this example, we use `tribble` to define the simulation parameters and +initialize 4 daemons with dispatcher = FALSE and a fixed seed to ensure +consistent random number generation across tasks. The `mirai_map()` function +parallelizes the drug efficacy simulation, and the results are combined using +`bind_rows()` for further analysis. Disabling the dispatcher gives more control +over task execution, ensuring reproducibility. If you repeat the computation you +will notice that it generates consistent results. + +However, this approach comes at a cost. Disabling the dispatcher may lead to +inefficient resource utilization when tasks are unevenly distributed, as some +daemons may remain idle. While reproducibility is prioritized, we sacrifice +some performance, especially when handling tasks with varying workloads. + +> "In prioritizing reproducibility, we inevitably sacrifice some performance, +especially when tasks with unequal workloads are distributed across daemons." 
+ +Reproducibility becomes more complex when using parallelization frameworks +like `{parallelMap}`, `{doFuture}`, and `{future}`, as each handles random +number generation (RNG) differently. While `set.seed()` is sufficient for +sequential tasks, parallel computations require managing RNG streams carefully, +often using types like "L'Ecuyer-CMRG" or functions such as +`clusterSetRNGStream()` for synchronization. Each framework requires +specific adjustments to ensure consistent results, emphasizing the +importance of understanding how each backend manages RNG in parallel +environments. ## Closing Thoughts -While we've explored the basics of reproducibility in parallel computing with simple examples, the challenges extend beyond random number generation. Issues such as process synchronization, using tools like lock files, become critical in multi-process environments. Floating-point arithmetic adds complexity, particularly when distributed across heterogeneous systems with varying architectures and precision. Managing dependencies also becomes more intricate as tasks grow in complexity, and ensuring error recovery in a controlled manner is vital to avoid crashes or inconsistent results in large-scale operations. - -Powerful tools like `{targets}` and `{crew}` can help tackle these advanced challenges. `{targets}` is a workflow orchestration tool that manages dependencies, automates reproducible pipelines, and ensures consistent results across runs. Meanwhile, `{crew}` extends this by efficiently managing distributed computing tasks, allowing for seamless scaling, load balancing, and error handling across local processes or cloud environments. Together, these tools simplify the execution of complex high-performance computing (HPC) workflows, providing flexibility and robustness for scaling computations while trying for maintaining control and reproducibility. 
- -This blog post has hopefully increased your intuition about the challenges that may arise when incorporating HPC into your work. By understanding these complexities, you’ll be better positioned to make informed decisions about the trade-offs—such as balancing performance and reproducibility — that are most relevant to your specific case. As your computations scale, finding the right balance between efficiency, accuracy, and reproducibility will be crucial for the success of your projects. +While we've explored the basics of reproducibility in parallel computing with +simple examples, the challenges extend beyond random number generation. Issues +such as process synchronization, using tools like lock files, become critical +in multi-process environments. Floating-point arithmetic adds complexity, +particularly when distributed across heterogeneous systems with varying +architectures and precision. Managing dependencies also becomes more +intricate as tasks grow in complexity, and ensuring error recovery in a +controlled manner is vital to avoid crashes or inconsistent results in +large-scale operations. + +Powerful tools like `{targets}` and `{crew}` can help tackle these advanced +challenges. `{targets}` is a workflow orchestration tool that manages +dependencies, automates reproducible pipelines, and ensures consistent results +across runs. Meanwhile, `{crew}` extends this by efficiently managing +distributed computing tasks, allowing for seamless scaling, load balancing, +and error handling across local processes or cloud environments. Together, these +tools simplify the execution of complex high-performance computing (HPC) +workflows, providing flexibility and robustness for scaling computations while +striving to maintain control and reproducibility. + +This blog post has hopefully increased your intuition about the challenges that +may arise when incorporating HPC into your work. 
By understanding these +complexities, you’ll be better positioned to make informed decisions about the +trade-offs—such as balancing performance and reproducibility—that are most +relevant to your specific case. As your computations scale, finding the right +balance between efficiency, accuracy, and reproducibility will be crucial for +the success of your projects. From 93dada623090d80a42ff6d4b1fd24f6ad5862c3b Mon Sep 17 00:00:00 2001 From: alekoure Date: Tue, 24 Sep 2024 17:51:05 +0000 Subject: [PATCH 03/18] Adding links to resources --- ...__reproducibility_vs.__parallelization.qmd | 45 ++++++++++++------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index 60020225..195c0bf5 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -35,8 +35,9 @@ growing complexity and scale of these computations, ensuring reproducibility of results becomes a significant challenge. In this blog post, we will explore common reproducibility challenges in drug -development and simulations, using the `{mirai}` package as a backend solution -to manage parallelization. +development and simulations, using the +[`{mirai}`](https://shikokuchuo.net/mirai/) package as a backend solution to +manage parallelization. ## The Problem: Reproducibility in Parallel Processing @@ -64,8 +65,12 @@ a significant challenge in data-driven research. 
### Tracking Operations in Parallel Computing -Let’s explore a simple scenario where parallelization creates confusion in -tracking operations due to the asynchronous nature of task execution and logging. +Let’s explore a simple scenario where parallelization creates confusion in +tracking operations due to the asynchronous nature of task execution and +logging. For this, we will also use the +[`{tidylog}`](https://github.com/elbersb/tidylog) package, which tracks and logs +`{dplyr}` operations, providing insight into how the computations are executed +across multiple workers. ```{r, message=FALSE, eval=FALSE} library("mirai") @@ -86,10 +91,12 @@ everywhere({ options("tidylog.display" = list(message, log_to_file)) }, log_file = "log.txt") +# perform computations in parallel m <- mirai_map(letters[1:3], \(x) { mutate(tibble(.rows = 1), "{x}" := "foo") }) +# collect results result <- m[] |> dplyr::bind_cols() daemons(0) @@ -130,14 +137,16 @@ process will complete and record its operations. By default, `{mirai}` uses an advanced dispatcher to manage task distribution efficiently, scheduling tasks in a First-In-First-Out manner and leveraging -`{nanonext}` primitives for zero-latency, resource-free task management. -However, its asynchronous execution can hinder reproducibility, especially with -random number generation (RNG) or tasks needing strict order. +[`{nanonext}`](https://shikokuchuo.net/nanonext/) primitives for zero-latency, +resource-free task management. However, its asynchronous execution can hinder +reproducibility, especially with random number generation (RNG) or tasks needing +strict order. To enhance reproducibility, `{mirai}` allows disabling the dispatcher, directly -connecting the host to daemons in a round-robin fashion. While less efficient, -this approach gives more control over task execution and is better suited for -ensuring consistent RNG and reproducible results. +connecting the host to daemons in a round-robin fashion. 
While less efficient, +this approach provides greater control over task execution and is better suited +for ensuring reproducibility by initializing +[L'Ecuyer-CMRG RNG streams](https://pubsonline.informs.org/doi/10.1287/opre.47.1.159). ```{r} library(mirai) @@ -174,8 +183,8 @@ results %>% ) ``` -In this example, we use `tribble` to define the simulation parameters and -initialize 4 daemons with dispatcher = FALSE and a fixed seed to ensure +In this example, we use `tribble()` to define the simulation parameters and +initialize 4 daemons with `dispatcher = FALSE` and a fixed seed to ensure consistent random number generation across tasks. The `mirai_map()` function parallelizes the drug efficacy simulation, and the results are combined using `bind_rows()` for further analysis. Disabling the dispatcher gives more control @@ -204,15 +213,17 @@ environments. While we've explored the basics of reproducibility in parallel computing with simple examples, the challenges extend beyond random number generation. Issues -such as process synchronization, using tools like lock files, become critical -in multi-process environments. Floating-point arithmetic adds complexity, +such as *process synchronization*, using tools like lock files (see for example +[`{filelock}`](https://r-lib.github.io/filelock/)), become critical +in multi-process environments. *Floating-point arithmetic* adds complexity, particularly when distributed across heterogeneous systems with varying -architectures and precision. Managing dependencies also becomes more -intricate as tasks grow in complexity, and ensuring error recovery in a +architectures and precision. *Managing dependencies* also becomes more +intricate as tasks grow in complexity, and ensuring *error recovery* in a controlled manner is vital to avoid crashes or inconsistent results in large-scale operations. 
-Powerful tools like `{targets}` and `{crew}` can help tackle these advanced +Powerful tools like [`{targets}`](https://docs.ropensci.org/targets/) and +[`{crew}`](https://wlandau.github.io/crew/) can help tackle these advanced challenges. `{targets}` is a workflow orchestration tool that manages dependencies, automates reproducible pipelines, and ensures consistent results across runs. Meanwhile, `{crew}` extends this by efficiently managing From b5838f8a9d62b23cd4eb69c5ae0704f14654f750 Mon Sep 17 00:00:00 2001 From: alekoure Date: Tue, 24 Sep 2024 17:58:34 +0000 Subject: [PATCH 04/18] Update wordlist --- inst/WORDLIST.txt | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/inst/WORDLIST.txt b/inst/WORDLIST.txt index b797af9a..54152541 100644 --- a/inst/WORDLIST.txt +++ b/inst/WORDLIST.txt @@ -1120,3 +1120,27 @@ astrazeneca laura MeetLaura needleman +Alexandros +clusterSetRNGStream +CMRG +doFuture +elbersb +filelock +HPC +Kouretsis +L'Ecuyer +mirai +nanonext +opre +parallelization +Parallelization +parallelizes +parallelMap +pubsonline +reprovision +shikokuchuo +tensio +tidylog +tidyr +wlandau +zzz From ce44a17c2c73e54569c365d5cd29a44c5f37e8a0 Mon Sep 17 00:00:00 2001 From: alekoure Date: Tue, 24 Sep 2024 17:58:43 +0000 Subject: [PATCH 05/18] Apply styler --- ...__reproducibility_vs.__parallelization.qmd | 23 +++++++++++-------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index 195c0bf5..bed6a4c5 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -80,16 
+80,19 @@ library("dplyr", warn.conflicts = FALSE) daemons(4) # load libraries on each worker and set up logging to a file -everywhere({ - library("dplyr") - library("tidyr") - library("mirai") - library("tidylog", warn.conflicts = FALSE) - - # Define function to log messages to the log file - log_to_file <- \(txt) cat(txt, file = log_file, sep = "\n", append = TRUE) - options("tidylog.display" = list(message, log_to_file)) -}, log_file = "log.txt") +everywhere( + { + library("dplyr") + library("tidyr") + library("mirai") + library("tidylog", warn.conflicts = FALSE) + + # Define function to log messages to the log file + log_to_file <- \(txt) cat(txt, file = log_file, sep = "\n", append = TRUE) + options("tidylog.display" = list(message, log_to_file)) + }, + log_file = "log.txt" +) # perform computations in parallel m <- mirai_map(letters[1:3], \(x) { From 77fcb29f3f02c124b1d7e75a85de2ba8a1fb8e24 Mon Sep 17 00:00:00 2001 From: alekoure Date: Tue, 24 Sep 2024 18:08:58 +0000 Subject: [PATCH 06/18] Add description in yaml header --- ...ance__computing:__reproducibility_vs.__parallelization.qmd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index bed6a4c5..936d1e4e 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -2,7 +2,9 @@ title: "The Tension of High-Performance Computing: Reproducibility vs. 
Parallelization" author: - name: Alexandros Kouretsis -description: "" +description: "Discover how to manage parallel processing and ensure + reproducibility in drug development using the {mirai} package and other HPC + tools." # Note that the date below will be auto-updated when the post is merged. date: "2024-12-01" # Please do not use any non-default categories. From 5b788445a15edca459804e39acf263dc48e3f7a0 Mon Sep 17 00:00:00 2001 From: alekoure Date: Mon, 7 Oct 2024 22:39:21 +0000 Subject: [PATCH 07/18] Add logrx to execute the workflow --- .../cache_execution.rds | Bin 0 -> 255 bytes posts/zzz_DO_NOT_EDIT_the__tensio.../log.txt | 3 - .../mirai_workflow.R | 19 ++ .../mirai_workflow.log | 171 ++++++++++++++++++ ...__reproducibility_vs.__parallelization.qmd | 125 ++++++++----- 5 files changed, 274 insertions(+), 44 deletions(-) create mode 100644 posts/zzz_DO_NOT_EDIT_the__tensio.../cache_execution.rds delete mode 100644 posts/zzz_DO_NOT_EDIT_the__tensio.../log.txt create mode 100644 posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R create mode 100644 posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.log diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../cache_execution.rds b/posts/zzz_DO_NOT_EDIT_the__tensio.../cache_execution.rds new file mode 100644 index 0000000000000000000000000000000000000000..3e18f876978df4e58f4b6882ed214358fe9bddc0 GIT binary patch literal 255 zcmV;lc`(C zCYYx7>XW-p-2|iuL1-Y8e|{$41ST&_0FY2h0kUEV)$`NC<_18lI1#)rMiWX{@Ogje zBSyMajt2FCp~c#gTGwlJrS}+6M+(Tw=Sasb-j&a^*4;T2`%>m F0055YeGLEr literal 0 HcmV?d00001 diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../log.txt b/posts/zzz_DO_NOT_EDIT_the__tensio.../log.txt deleted file mode 100644 index 65f1f6da..00000000 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../log.txt +++ /dev/null @@ -1,3 +0,0 @@ -mutate: new variable 'b' (character) with one unique value and 0% NA -mutate: new variable 'a' (character) with one unique value and 0% NA -mutate: new variable 'c' (character) with one unique value 
and 0% NA diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R b/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R new file mode 100644 index 00000000..731e4e19 --- /dev/null +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R @@ -0,0 +1,19 @@ +{ + library("mirai") + library("dplyr", warn.conflicts = FALSE) + log_file <- tempfile() + mirai::daemons(4) + mirai::everywhere({ + library("dplyr", warn.conflicts = FALSE) + library("tidylog", warn.conflicts = FALSE) + log_to_file <- function(txt) cat(txt, file = log_file, + sep = "\n", append = TRUE) + options(tidylog.display = list(message, log_to_file)) + }, log_file = log_file) + m <- mirai_map(letters[1:5], function(x) { + mutate(tidyr::tibble(.rows = 1), `:=`("{x}", "foo")) + }) + result <- dplyr::bind_cols(m[]) + mirai::daemons(0) + print(list(logs = readLines(log_file), result = result)) +} diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.log b/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.log new file mode 100644 index 00000000..1a089751 --- /dev/null +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.log @@ -0,0 +1,171 @@ +-------------------------------------------------------------------------------- +- logrx Metadata - +-------------------------------------------------------------------------------- +This log was generated using logrx 0.3.1 +logrx package version: 0.3.1 +logrx build: RSPM (R 4.3.0) +logrx link to repository: https://github.com/pharmaverse/logrx +-------------------------------------------------------------------------------- +- User and File Information - +-------------------------------------------------------------------------------- +User: ale +File Name: mirai_workflow.R +File Path: /home/ale/blog/posts/zzz_DO_NOT_EDIT_the__tensio... 
+File HashSum: 93e86d1e6a59ab5475d3c9fbd1948be36228a1fe +-------------------------------------------------------------------------------- +- Session Information - +-------------------------------------------------------------------------------- +─ Session info ─────────────────────────────────────────────────────────────── + setting value + version R version 4.3.2 (2023-10-31) + os Ubuntu 22.04.3 LTS + system x86_64, linux-gnu + ui X11 + language (EN) + collate en_US.UTF-8 + ctype en_US.UTF-8 + tz Etc/UTC + date 2024-10-07 + pandoc 3.1.1 @ /usr/lib/rstudio-server/bin/quarto/bin/tools/ (via rmarkdown) + +─ Packages ─────────────────────────────────────────────────────────────────── + package * version date (UTC) lib source + backports 1.4.1 2021-12-13 [1] RSPM (R 4.3.0) + callr 3.7.3 2022-11-02 [1] RSPM (R 4.3.0) + cli 3.6.2 2023-12-11 [1] RSPM (R 4.3.0) + crayon 1.5.2 2022-09-29 [1] RSPM (R 4.3.0) + cyclocomp 1.1.1 2023-08-30 [1] RSPM (R 4.3.0) + data.table 1.15.0 2024-01-30 [1] RSPM (R 4.3.0) + desc 1.4.3 2023-12-10 [1] RSPM (R 4.3.0) + digest 0.6.34 2024-01-11 [1] RSPM (R 4.3.0) + dplyr * 1.1.4 2023-11-17 [1] RSPM (R 4.3.0) + evaluate 0.23 2023-11-01 [1] RSPM (R 4.3.0) + fansi 1.0.6 2023-12-08 [1] RSPM (R 4.3.0) + fastmap 1.2.0 2024-05-15 [1] RSPM (R 4.3.0) + generics 0.1.3 2022-07-05 [1] RSPM (R 4.3.0) + glue 1.7.0 2024-01-09 [1] RSPM (R 4.3.0) + htmltools 0.5.7 2023-11-03 [1] RSPM (R 4.3.0) + htmlwidgets 1.6.4 2023-12-06 [1] RSPM (R 4.3.0) + httpuv 1.6.15 2024-03-26 [1] RSPM (R 4.3.0) + jsonlite 1.8.8 2023-12-04 [1] RSPM (R 4.3.0) + knitr 1.45 2023-10-30 [1] RSPM (R 4.3.0) + later 1.3.2 2023-12-06 [1] RSPM (R 4.3.0) + lazyeval 0.2.2 2019-03-15 [1] RSPM (R 4.3.0) + lifecycle 1.0.4 2023-11-07 [1] RSPM (R 4.3.0) + lintr 3.1.2 2024-03-25 [1] RSPM (R 4.3.0) + logrx 0.3.1 2024-04-12 [1] RSPM (R 4.3.0) + magrittr 2.0.3 2022-03-30 [1] RSPM (R 4.3.0) + mime 0.12 2021-09-28 [1] RSPM (R 4.3.0) + miniUI 0.1.1.1 2018-05-18 [1] RSPM (R 4.3.0) + mirai * 1.2.0 2024-08-18 [1] 
RSPM (R 4.3.0) + nanonext 1.2.1 2024-08-19 [1] RSPM (R 4.3.0) + pillar 1.9.0 2023-03-22 [1] RSPM (R 4.3.0) + pkgconfig 2.0.3 2019-09-22 [1] RSPM (R 4.3.0) + processx 3.8.3 2023-12-10 [1] RSPM (R 4.3.0) + promises 1.3.0 2024-04-05 [1] RSPM (R 4.3.0) + ps 1.7.6 2024-01-18 [1] RSPM (R 4.3.0) + purrr 1.0.2 2023-08-10 [1] RSPM (R 4.3.0) + R6 2.5.1 2021-08-19 [1] RSPM (R 4.3.0) + Rcpp 1.0.12 2024-01-09 [1] RSPM (R 4.3.0) + remotes 2.5.0 2024-03-17 [1] RSPM (R 4.3.0) + rex 1.2.1 2021-11-26 [1] RSPM (R 4.3.0) + rlang 1.1.3 2024-01-10 [1] RSPM (R 4.3.0) + rmarkdown 2.28 2024-08-17 [1] RSPM (R 4.3.0) + rstudioapi 0.15.0 2023-07-07 [1] RSPM (R 4.3.0) + sessioninfo 1.2.2 2021-12-06 [1] RSPM (R 4.3.0) + shiny 1.9.1 2024-08-01 [1] RSPM (R 4.3.0) + stringi 1.8.3 2023-12-11 [1] RSPM (R 4.3.0) + stringr 1.5.1 2023-11-14 [1] RSPM (R 4.3.0) + tibble 3.2.1 2023-03-20 [1] RSPM (R 4.3.0) + tidyr 1.3.1 2024-01-24 [1] RSPM (R 4.3.0) + tidyselect 1.2.0 2022-10-10 [1] RSPM (R 4.3.0) + utf8 1.2.4 2023-10-22 [1] RSPM (R 4.3.0) + vctrs 0.6.5 2023-12-01 [1] RSPM (R 4.3.0) + waiter 0.2.5 2022-01-03 [1] RSPM (R 4.3.0) + withr 3.0.0 2024-01-16 [1] RSPM (R 4.3.0) + xfun 0.42 2024-02-08 [1] RSPM (R 4.3.0) + xml2 1.3.6 2023-12-04 [1] RSPM (R 4.3.0) + xtable 1.8-4 2019-04-21 [1] RSPM (R 4.3.0) + yaml 2.3.8 2023-12-11 [1] RSPM (R 4.3.0) + + [1] /usr/local/lib/R/site-library + [2] /usr/local/lib/R/library + +─ External software ────────────────────────────────────────────────────────── + setting value + cairo 1.16.0 + cairoFT + pango 1.50.6 + png 1.6.37 + jpeg 8.0 + tiff LIBTIFF, Version 4.3.0 + tcl + curl 7.81.0 + zlib 1.2.11 + bzlib 1.0.8, 13-Jul-2019 + xz 5.2.5 + PCRE 10.39 2021-10-29 + ICU 70.1 + TRE TRE 0.8.0 R_fixes (BSD) + iconv glibc 2.35 + readline 8.1 + BLAS /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 + + lapack /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so + lapack_version 3.10.0 + +─ Python configuration ─────────────────────────────────────────────────────── 
+ Python is not available + +────────────────────────────────────────────────────────────────────────────── +-------------------------------------------------------------------------------- +- Masked Functions - +-------------------------------------------------------------------------------- +function `plot` from {package:base} by package:graphics +function `body<-` from {package:base} by package:methods +function `kronecker` from {package:base} by package:methods +-------------------------------------------------------------------------------- +- Used Package and Functions - +-------------------------------------------------------------------------------- +{!!! NOT FOUND !!!} `:=` +{package:base} library, tempfile, cat, options, list, print, readLines +{package:dplyr} mutate, bind_cols +{package:mirai} daemons, everywhere, mirai_map +{package:tidyr} tibble +-------------------------------------------------------------------------------- +- Program Run Time Information - +-------------------------------------------------------------------------------- +Start time: 2024-10-07 22:16:34 UTC +End time: 2024-10-07 22:16:36 UTC +Run time: 2 seconds +-------------------------------------------------------------------------------- +- Errors and Warnings - +-------------------------------------------------------------------------------- +Errors: + + +Warnings: + +-------------------------------------------------------------------------------- +- Messages, Output, and Result - +-------------------------------------------------------------------------------- + +Result: + $logs + [1] "mutate: new variable 'b' (character) with one unique value and 0% NAmutate: new variable 'a' (character) with one unique value and 0% NA" + [2] "" + [3] "mutate: new variable 'd' (character) with one unique value and 0% NA" + [4] "mutate: new variable 'e' (character) with one unique value and 0% NA" + [5] "mutate: new variable 'c' (character) with one unique value and 0% NA" + + $result + # A 
tibble: 1 × 5 + a b c d e + + 1 foo foo foo foo foo + +-------------------------------------------------------------------------------- +- Log Output File - +-------------------------------------------------------------------------------- +Log name: mirai_workflow.log +Log path: /home/ale/blog/posts/zzz_DO_NOT_EDIT_the__tensio... diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index 936d1e4e..7698d0b8 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -72,46 +72,56 @@ tracking operations due to the asynchronous nature of task execution and logging. For this, we will also use the [`{tidylog}`](https://github.com/elbersb/tidylog) package, which tracks and logs `{dplyr}` operations, providing insight into how the computations are executed -across multiple workers. - -```{r, message=FALSE, eval=FALSE} -library("mirai") -library("dplyr", warn.conflicts = FALSE) - -# start parallel workers -daemons(4) - -# load libraries on each worker and set up logging to a file -everywhere( - { - library("dplyr") - library("tidyr") - library("mirai") - library("tidylog", warn.conflicts = FALSE) - - # Define function to log messages to the log file - log_to_file <- \(txt) cat(txt, file = log_file, sep = "\n", append = TRUE) - options("tidylog.display" = list(message, log_to_file)) - }, - log_file = "log.txt" -) - -# perform computations in parallel -m <- mirai_map(letters[1:3], \(x) { - mutate(tibble(.rows = 1), "{x}" := "foo") +across multiple workers. 
+ +Our workflow will be constructed in a script and executed using the `{logrx}` +package from Pharmaverse. We will craft the workflow as an expression using the +`base::substitute()` function, which will later be used to generate the complete +workflow script for parallel execution. + +```{r, message=FALSE, eval=TRUE} +mirai_workflow <- substitute({ + library("mirai") + library("dplyr", warn.conflicts = FALSE) + log_file <- tempfile() + # start parallel workers + mirai::daemons(4) + + # load libraries on each worker and set up logging to a file + mirai::everywhere( + { + library("dplyr", warn.conflicts = FALSE) + library("tidylog", warn.conflicts = FALSE) + + # Define function to log messages to the log file + log_to_file <- \(txt) cat(txt, file = log_file, sep = "\n", append = TRUE) + options("tidylog.display" = list(message, log_to_file)) + }, + log_file = log_file + ) + + # perform computations in parallel + m <- mirai_map(letters[1:5], \(x) { + mutate(tidyr::tibble(.rows = 1), "{x}" := "foo") + }) + + # collect results + result <- m[] |> dplyr::bind_cols() + + mirai::daemons(0) + + print( + list( + logs = readLines(log_file), + result = result + ) + ) }) - -# collect results -result <- m[] |> dplyr::bind_cols() - -daemons(0) - -return(result) ``` In the above code chunk, we set up a parallel processing environment using the `{mirai}` package. The function `mirai_map()` is used to apply a mutating -function in parallel to a tibble for each element of `letters[1:3]`, logging the +function in parallel to a tibble for each element of `letters`, logging the operations to a file using the `{tidylog}` package. However, while we can log each operation as it happens, due to the parallel nature of `{mirai}`, the logging does not occur in a controlled or sequential order. 
*Each daemon @@ -128,15 +138,48 @@ asynchronous logging may create challenges in *tracing* and *debugging* the process, as entries in the log file could appear out of order, giving an incomplete or misleading representation of the task flow. +Let's first save the code to an R script called `mirai_workflow.R`. This step +helps ensure that the execution can be properly tracked and documented: + ```{r} -readLines("log.txt") +mirai_workflow |> + deparse() |> + writeLines("mirai_workflow.R") +``` + +Next, we execute the script using `logrx::axecute()`, which not only runs the +workflow but also logs key metadata and outputs for enhanced traceability and +reproducibility, as outlined in the `{logrx}` vignettes: + +```{r, eval=FALSE} +logrx::axecute("mirai_workflow.R", to_report = "result") +``` + +```{r cache_exec, eval=FALSE, echo=FALSE} +#run this to refresh cache and get a non ordered log file +res_to_cache <- source("mirai_workflow.R") +saveRDS(res_to_cache$value, "cache_execution.rds") +``` + +```{r, echo=FALSE} +readRDS("cache_execution.rds") ``` -In the above code, when we read the contents of the log file, you will notice -that the logs are not in the same order as the commands were dispatched. This -demonstrates the inherent difficulty in managing the order of logging in -parallel tasks, especially when there is no guarantee on how quickly each -process will complete and record its operations. +Upon examining the log file generated, you'll notice that the entries are not in +the same order as the commands were dispatched. This illustrates the inherent +difficulty in maintaining a consistent logging sequence for parallel tasks, +especially since the timing of each process completion and log recording is +unpredictable. + +Additionally, it is worth noting that `logrx` does not capture the logging +performed by `{tidylog}` during the execution of tasks on `{mirai}` daemons. 
+This is because the daemons run as independent R processes, and the logging +messages are not propagated back to the parent process in a straightforward +manner. As described in `{mirai}`'s documentation, daemons are responsible for +handling tasks asynchronously, and messages logged within these processes do not +automatically integrate into the parent session. Therefore, the only way to +access `{tidylog}` messages is indirectly, by reading the dedicated log file +(the temporary file stored in `log_file`) that each worker writes to during execution. ### Task Dispatching and RNG Management From c112049aab69387182d0f47489023b6cd7a9ada1 Mon Sep 17 00:00:00 2001 From: alekoure Date: Mon, 7 Oct 2024 23:19:21 +0000 Subject: [PATCH 08/18] Adding floating point paragraph --- .../mirai_workflow.R | 8 ++--- ...__reproducibility_vs.__parallelization.qmd | 32 ++++++++++++------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R b/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R index 731e4e19..98bb01d8 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R @@ -1,17 +1,17 @@ { library("mirai") - library("dplyr", warn.conflicts = FALSE) + library("dplyr") log_file <- tempfile() mirai::daemons(4) mirai::everywhere({ - library("dplyr", warn.conflicts = FALSE) - library("tidylog", warn.conflicts = FALSE) + library("dplyr") + library("tidylog") log_to_file <- function(txt) cat(txt, file = log_file, sep = "\n", append = TRUE) options(tidylog.display = list(message, log_to_file)) }, log_file = log_file) m <- mirai_map(letters[1:5], function(x) { - mutate(tidyr::tibble(.rows = 1), `:=`("{x}", "foo")) + mutate(tibble(.rows = 1), `:=`("{x}", "foo")) }) result <- dplyr::bind_cols(m[]) mirai::daemons(0) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd
b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index 7698d0b8..b53170cb 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -82,16 +82,18 @@ workflow script for parallel execution. ```{r, message=FALSE, eval=TRUE} mirai_workflow <- substitute({ library("mirai") - library("dplyr", warn.conflicts = FALSE) + library("dplyr") + log_file <- tempfile() + # start parallel workers mirai::daemons(4) # load libraries on each worker and set up logging to a file mirai::everywhere( { - library("dplyr", warn.conflicts = FALSE) - library("tidylog", warn.conflicts = FALSE) + library("dplyr") + library("tidylog") # Define function to log messages to the log file log_to_file <- \(txt) cat(txt, file = log_file, sep = "\n", append = TRUE) @@ -102,11 +104,11 @@ mirai_workflow <- substitute({ # perform computations in parallel m <- mirai_map(letters[1:5], \(x) { - mutate(tidyr::tibble(.rows = 1), "{x}" := "foo") + mutate(tibble(.rows = 1), "{x}" := "foo") }) # collect results - result <- m[] |> dplyr::bind_cols() + result <- m[] |> bind_cols() mirai::daemons(0) @@ -209,19 +211,19 @@ cohorts <- tribble( ) # Start daemons with consistent RNG streams -x <- daemons(4, dispatcher = FALSE, seed = 123) +x <- mirai::daemons(4, dispatcher = FALSE, seed = 123) # Parallel simulation for each row of the cohorts table -m <- mirai_map(cohorts, \(patient_count, mean_effect, sd_effect) { +m <- mirai::mirai_map(cohorts, \(patient_count, mean_effect, sd_effect) { dplyr::tibble( patient_id = 1:patient_count, efficacy = rnorm(patient_count, mean = mean_effect, sd = sd_effect) ) }) -results <- m[] |> dplyr::bind_rows() +results <- m[] |> bind_rows() -x <- daemons(0, dispatcher = FALSE) +x <- mirai::daemons(0, 
dispatcher = FALSE) results %>% group_by(patient_id) %>% @@ -244,9 +246,6 @@ inefficient resource utilization when tasks are unevenly distributed, as some daemons may remain idle. While reproducibility is prioritized, we sacrifice some performance, especially when handling tasks with varying workloads. -> "In prioritizing reproducibility, we inevitably sacrifice some performance, -especially when tasks with unequal workloads are distributed across daemons." - Reproducibility becomes more complex when using parallelization frameworks like `{parallelMap}`, `{doFuture}`, and `{future}`, as each handles random number generation (RNG) differently. While `set.seed()` is sufficient for @@ -257,6 +256,15 @@ specific adjustments to ensure consistent results, emphasizing the importance of understanding how each backend manages RNG in parallel environments. +Even without random numbers, seemingly simple tasks—like summing over +floating-point numbers—can yield different results in parallel processing. This +is because floating-point numbers are not represented exactly in the system, +and the final result depends on the order of operations. In a parallel +environment, where tasks are executed asynchronously, the order in which +operations occur may differ, leading to slight variations in the final output. +These differences become more pronounced in large-scale computations, +complicating the reproducibility of results. 
+ ## Closing Thoughts While we've explored the basics of reproducibility in parallel computing with From 5e4618dbdaed36aeec2a91d6c63b841b6a6b4d98 Mon Sep 17 00:00:00 2001 From: alekoure Date: Mon, 7 Oct 2024 23:39:20 +0000 Subject: [PATCH 09/18] Add a link to round-robin explanation --- .../cache_execution.rds | Bin 255 -> 262 bytes .../mirai_workflow.R | 4 ++-- ...g:__reproducibility_vs.__parallelization.qmd | 14 ++++++++------ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../cache_execution.rds b/posts/zzz_DO_NOT_EDIT_the__tensio.../cache_execution.rds index 3e18f876978df4e58f4b6882ed214358fe9bddc0..880f563ccee75652ee5f9e2cfe065d7e71eb1b26 100644 GIT binary patch literal 262 zcmV+h0r~zPiwFP!000001FexkOT#b}fM1hV+&~a^+ASBEA{LM0O%EPDP4OnL&D$(w zDW<8t`pccKTLPsAL1^G5-wQ9_3(4CY03@VRfXo;nds#o-+yRIMCxYX=--Hxqygu$) zkDl(8rA~dJtFSPnR=84^mG!hGccr?@f2i7$YHjt|5|;lc`(C zCYYx7>XW-p-2|iuL1-Y8e|{$41ST&_0FY2h0kUEV)$`NC<_18lI1#)rMiWX{@Ogje zBSyMajt2FCp~c#gTGwlJrS}+6M+(Tw=Sasb-j&a^*4;T2`%>m F0055YeGLEr diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R b/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R index 98bb01d8..9d70f056 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R @@ -11,9 +11,9 @@ options(tidylog.display = list(message, log_to_file)) }, log_file = log_file) m <- mirai_map(letters[1:5], function(x) { - mutate(tibble(.rows = 1), `:=`("{x}", "foo")) + mutate(tibble(.rows = 1), `:=`("{x}", sample(1:100, 1))) }) - result <- dplyr::bind_cols(m[]) + result <- bind_cols(m[]) mirai::daemons(0) print(list(logs = readLines(log_file), result = result)) } diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index 
b53170cb..149407b1 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -104,7 +104,7 @@ mirai_workflow <- substitute({ # perform computations in parallel m <- mirai_map(letters[1:5], \(x) { - mutate(tibble(.rows = 1), "{x}" := "foo") + mutate(tibble(.rows = 1), "{x}" := sample(1:100, 1)) }) # collect results @@ -151,7 +151,7 @@ mirai_workflow |> Next, we execute the script using `logrx::axecute()`, which not only runs the workflow but also logs key metadata and outputs for enhanced traceability and -reproducibility, as outlined in the `{logrx}` vignettes: +reproducibility: ```{r, eval=FALSE} logrx::axecute("mirai_workflow.R", to_report = "result") @@ -192,10 +192,12 @@ resource-free task management. However, its asynchronous execution can hinder reproducibility, especially with random number generation (RNG) or tasks needing strict order. -To enhance reproducibility, `{mirai}` allows disabling the dispatcher, directly -connecting the host to daemons in a round-robin fashion. While less efficient, -this approach provides greater control over task execution and is better suited -for ensuring reproducibility by initializing +To enhance reproducibility, `{mirai}` allows disabling the dispatcher, which +usually decides the order in which tasks are run. Instead, it connects directly +to the workers one by one in a simple order +(see [round-robin](https://en.wikipedia.org/wiki/Round-robin_scheduling)). While +less efficient, this approach provides greater control over task execution and +is better suited for ensuring reproducibility by initializing [L'Ecuyer-CMRG RNG streams](https://pubsonline.informs.org/doi/10.1287/opre.47.1.159).
```{r} From 2f0e088ce38738b107d0a06815941dea7c4520fd Mon Sep 17 00:00:00 2001 From: alekoure Date: Mon, 7 Oct 2024 23:49:19 +0000 Subject: [PATCH 10/18] Add daemon definition --- ...mputing:__reproducibility_vs.__parallelization.qmd | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index 149407b1..805d71d8 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -74,10 +74,11 @@ logging. For this, we will also use the `{dplyr}` operations, providing insight into how the computations are executed across multiple workers. -Our workflow will be constructed in a script and executed using the `{logrx}` -package from Pharmaverse. We will craft the workflow as an expression using the -`base::substitute()` function, which will later be used to generate the complete -workflow script for parallel execution. +We'll create our workflow in a script and run it using the `{logrx}` package +from Pharmaverse. The workflow will be written as an expression using +`base::substitute()`, which will help generate the complete script. In our +example, we'll start four daemons. A daemon is a process that runs in +the background continuously and handles specific computing tasks. ```{r, message=FALSE, eval=TRUE} mirai_workflow <- substitute({ @@ -126,7 +127,7 @@ In the above code chunk, we set up a parallel processing environment using the function in parallel to a tibble for each element of `letters`, logging the operations to a file using the `{tidylog}` package.
However, while we can log each operation as it happens, due to the parallel nature of `{mirai}`, the -logging does not occur in a controlled or sequential order. *Each daemon +logging does not occur in a controlled or sequential order. *Each daemon executes its task independently, and the order of logging in the file will depend on the completion times of these parallel processes rather than the intended flow of operations.* From 68a2dc605965eb17fc9e7afafb8d7cbfccbdd3f3 Mon Sep 17 00:00:00 2001 From: alekoure Date: Tue, 8 Oct 2024 00:25:23 +0000 Subject: [PATCH 11/18] Adding explicit description of the simulation --- ...__reproducibility_vs.__parallelization.qmd | 58 ++++++++++--------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index 805d71d8..5a612470 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -201,6 +201,11 @@ less efficient, this approach provides greater control over task execution and is better suited for ensuring reproducibility by initializing [L'Ecuyer-CMRG RNG streams](https://pubsonline.informs.org/doi/10.1287/opre.47.1.159). +In the following example, we simulate drug efficacy across different patient +cohorts using parallel processing with the `{mirai}` package. We define three +cohorts, each with a different mean drug effect and standard deviation, and +initialize four daemons to handle the computations. 
+ ```{r} library(mirai) library(dplyr, warn.conflicts = FALSE) @@ -236,37 +241,34 @@ results %>% ) ``` -In this example, we use `tribble()` to define the simulation parameters and +We used `tribble()` to define the simulation parameters and initialize 4 daemons with `dispatcher = FALSE` and a fixed seed to ensure consistent random number generation across tasks. The `mirai_map()` function parallelizes the drug efficacy simulation, and the results are combined using -`bind_rows()` for further analysis. Disabling the dispatcher gives more control -over task execution, ensuring reproducibility. If you repeat the computation you -will notice that it generates consistent results. - -However, this approach comes at a cost. Disabling the dispatcher may lead to -inefficient resource utilization when tasks are unevenly distributed, as some -daemons may remain idle. While reproducibility is prioritized, we sacrifice -some performance, especially when handling tasks with varying workloads. - -Reproducibility becomes more complex when using parallelization frameworks -like `{parallelMap}`, `{doFuture}`, and `{future}`, as each handles random -number generation (RNG) differently. While `set.seed()` is sufficient for -sequential tasks, parallel computations require managing RNG streams carefully, -often using types like "L'Ecuyer-CMRG" or functions such as -`clusterSetRNGStream()` for synchronization. Each framework requires -specific adjustments to ensure consistent results, emphasizing the -importance of understanding how each backend manages RNG in parallel -environments. - -Even without random numbers, seemingly simple tasks—like summing over -floating-point numbers—can yield different results in parallel processing. This -is because floating-point numbers are not represented exactly in the system, -and the final result depends on the order of operations. 
In a parallel -environment, where tasks are executed asynchronously, the order in which -operations occur may differ, leading to slight variations in the final output. -These differences become more pronounced in large-scale computations, -complicating the reproducibility of results. +`bind_rows()` for further analysis. + +Disabling the dispatcher gives more control over task execution, ensuring +reproducibility. If you repeat the computation, you +will notice that it generates consistent results. However, this approach comes +at a cost. Disabling the dispatcher may lead to inefficient resource utilization +when tasks are unevenly distributed, as some daemons may remain idle. While +reproducibility is prioritized, we sacrifice some performance, especially +when handling tasks with varying workloads. + +Reproducibility becomes trickier when using parallelization frameworks like +`{parallelMap}`, `{doFuture}`, and `{future}`, as each handles random number +generation (RNG) differently. While `set.seed()` works for sequential tasks, +parallel tasks need careful management of RNG streams, often using specific +methods like "L’Ecuyer-CMRG" or functions like `clusterSetRNGStream()` to keep +results consistent. Each framework has its own approach, so it's important to +understand how each one manages RNG to ensure reproducibility. + +Even without random numbers, simple tasks—like adding floating-point numbers—can +give different results in parallel processing. This happens because +floating-point numbers aren’t exactly represented, and the order of operations +can affect the outcome. In parallel environments, where tasks finish in +different orders, these small differences can add up, making it harder to +reproduce results in large computations.
## Closing Thoughts From 3d5d06a67443ac49c768517d21796890d0681057 Mon Sep 17 00:00:00 2001 From: alekoure Date: Tue, 8 Oct 2024 00:27:15 +0000 Subject: [PATCH 12/18] Fix reference to log file --- ...rmance__computing:__reproducibility_vs.__parallelization.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index 5a612470..e53bb909 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -182,7 +182,7 @@ manner. As described in `{mirai}`'s documentation, daemons are responsible for handling tasks asynchronously, and messages logged within these processes do not automatically integrate into the parent session. Therefore, the only way to access `{tidylog}` messages is indirectly, by reading the dedicated log file -(`log.txt`) that each worker writes to during execution. +(`log_file`) that each worker writes to during execution.
### Task Dispatching and RNG Management From cb82409a75f43deaa5e9986c83816a563b8f656f Mon Sep 17 00:00:00 2001 From: alekoure Date: Sun, 13 Oct 2024 08:15:14 +0000 Subject: [PATCH 13/18] Update the dispatcher argument to "none" --- ...ting:__reproducibility_vs.__parallelization.qmd | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index e53bb909..e96b9c57 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -180,8 +180,8 @@ This is because the daemons run as independent R processes, and the logging messages are not propagated back to the parent process in a straightforward manner. As described in `{mirai}`'s documentation, daemons are responsible for handling tasks asynchronously, and messages logged within these processes do not -automatically integrate into the parent session. Therefore, the only way to -access `{tidylog}` messages is indirectly, by reading the dedicated log file +automatically integrate into the parent session. Therefore, we access +`{tidylog}` messages indirectly, by reading the dedicated log file (`log_file`) that each worker writes to during execution. 
### Task Dispatching and RNG Management @@ -219,7 +219,11 @@ cohorts <- tribble( ) # Start daemons with consistent RNG streams -x <- mirai::daemons(4, dispatcher = FALSE, seed = 123) +x <- mirai::daemons( + n = 4, + dispatcher = "none", # For mirai versions below 1.3.0, use dispatcher = FALSE + seed = 123 +) # Parallel simulation for each row of the cohorts table m <- mirai::mirai_map(cohorts, \(patient_count, mean_effect, sd_effect) { @@ -231,7 +235,7 @@ m <- mirai::mirai_map(cohorts, \(patient_count, mean_effect, sd_effect) { results <- m[] |> bind_rows() -x <- mirai::daemons(0, dispatcher = FALSE) +x <- mirai::daemons(0, dispatcher = "none") results %>% group_by(patient_id) %>% @@ -242,7 +246,7 @@ results %>% ``` We used `tribble()` to define the simulation parameters and -initialize 4 daemons with `dispatcher = FALSE` and a fixed seed to ensure +initialized 4 daemons with `dispatcher = "none"` and a fixed seed to ensure consistent random number generation across tasks. The `mirai_map()` function parallelizes the drug efficacy simulation, and the results are combined using `bind_rows()` for further analysis.
From 72d007086c98aeabad78b5d0a2b32b22b2db0f7c Mon Sep 17 00:00:00 2001 From: alekoure Date: Sun, 13 Oct 2024 08:19:36 +0000 Subject: [PATCH 14/18] Update WORDLIST.txt --- inst/WORDLIST.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/inst/WORDLIST.txt b/inst/WORDLIST.txt index 54152541..086d9234 100644 --- a/inst/WORDLIST.txt +++ b/inst/WORDLIST.txt @@ -1144,3 +1144,6 @@ tidylog tidyr wlandau zzz +axecute +readRDS +saveRDS From e1fb73b970847b08ae297390971c5980d4eaf0f6 Mon Sep 17 00:00:00 2001 From: alekoure Date: Sun, 13 Oct 2024 08:20:08 +0000 Subject: [PATCH 15/18] Apply styler fixes --- .../mirai_workflow.R | 41 +++++++++++-------- ...__reproducibility_vs.__parallelization.qmd | 20 ++++----- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R b/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R index 9d70f056..ad953a4d 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../mirai_workflow.R @@ -1,19 +1,26 @@ { - library("mirai") - library("dplyr") - log_file <- tempfile() - mirai::daemons(4) - mirai::everywhere({ - library("dplyr") - library("tidylog") - log_to_file <- function(txt) cat(txt, file = log_file, - sep = "\n", append = TRUE) - options(tidylog.display = list(message, log_to_file)) - }, log_file = log_file) - m <- mirai_map(letters[1:5], function(x) { - mutate(tibble(.rows = 1), `:=`("{x}", sample(1:100, 1))) - }) - result <- bind_cols(m[]) - mirai::daemons(0) - print(list(logs = readLines(log_file), result = result)) + library("mirai") + library("dplyr") + log_file <- tempfile() + mirai::daemons(4) + mirai::everywhere( + { + library("dplyr") + library("tidylog") + log_to_file <- function(txt) { + cat(txt, + file = log_file, + sep = "\n", append = TRUE + ) + } + options(tidylog.display = list(message, log_to_file)) + }, + log_file = log_file + ) + m <- mirai_map(letters[1:5], function(x) { + mutate(tibble(.rows = 1), 
`:=`("{x}", sample(1:100, 1))) + }) + result <- bind_cols(m[]) + mirai::daemons(0) + print(list(logs = readLines(log_file), result = result)) } diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index e96b9c57..03505eeb 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -84,35 +84,35 @@ the background continuously and handles specific computing tasks. mirai_workflow <- substitute({ library("mirai") library("dplyr") - + log_file <- tempfile() - + # start parallel workers mirai::daemons(4) - + # load libraries on each worker and set up logging to a file mirai::everywhere( { library("dplyr") library("tidylog") - + # Define function to log messages to the log file log_to_file <- \(txt) cat(txt, file = log_file, sep = "\n", append = TRUE) options("tidylog.display" = list(message, log_to_file)) }, log_file = log_file ) - + # perform computations in parallel m <- mirai_map(letters[1:5], \(x) { mutate(tibble(.rows = 1), "{x}" := sample(1:100, 1)) }) - + # collect results result <- m[] |> bind_cols() - + mirai::daemons(0) - + print( list( logs = readLines(log_file), @@ -159,7 +159,7 @@ logrx::axecute("mirai_workflow.R", to_report = "result") ``` ```{r cache_exec, eval=FALSE, echo=FALSE} -#run this to refresh cache and get a non ordered log file +# run this to refresh cache and get a non ordered log file res_to_cache <- source("mirai_workflow.R") saveRDS(res_to_cache$value, "cache_execution.rds") ``` @@ -235,7 +235,7 @@ m <- mirai::mirai_map(cohorts, \(patient_count, mean_effect, sd_effect) { results <- m[] |> bind_rows() -x <- 
mirai::daemons(0, dispatcher = "none") +x <- mirai::daemons(0, dispatcher = "none") results %>% group_by(patient_id) %>% From 061216d2686fb956255116537d187be6ebee106e Mon Sep 17 00:00:00 2001 From: alekoure Date: Sun, 13 Oct 2024 08:40:43 +0000 Subject: [PATCH 16/18] Adding affiliation --- ...ormance__computing:__reproducibility_vs.__parallelization.qmd | 1 + 1 file changed, 1 insertion(+) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index 03505eeb..09d7dbc8 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -2,6 +2,7 @@ title: "The Tension of High-Performance Computing: Reproducibility vs. Parallelization" author: - name: Alexandros Kouretsis + - name: APPSILON description: "Discover how to manage parallel processing and ensure reproducibility in drug development using the {mirai} package and other HPC tools." 
From f85cc941237cf13916c51f56be47efc479e3207a Mon Sep 17 00:00:00 2001 From: alekoure Date: Sun, 13 Oct 2024 08:51:09 +0000 Subject: [PATCH 17/18] Fix typo --- ...rmance__computing:__reproducibility_vs.__parallelization.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index 09d7dbc8..ea4540b8 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -302,7 +302,7 @@ trying for maintaining control and reproducibility. This blog post has hopefully increased your intuition about the challenges that may arise when incorporating HPC into your work. By understanding these complexities, you’ll be better positioned to make informed decisions about the -trade-offs—such as balancing performance and reproducibility — that are most +trade-offs—such as balancing performance and reproducibility—that are most relevant to your specific case. As your computations scale, finding the right balance between efficiency, accuracy, and reproducibility will be crucial for the success of your projects. 
From cad2808b2bbd4063b22f927210e263d7135a00ad Mon Sep 17 00:00:00 2001 From: alekoure Date: Wed, 16 Oct 2024 10:39:17 +0000 Subject: [PATCH 18/18] Replace RNG reference link with doi URL --- ...rmance__computing:__reproducibility_vs.__parallelization.qmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd index ea4540b8..be355758 100644 --- a/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd +++ b/posts/zzz_DO_NOT_EDIT_the__tensio.../the__tension_of__high-_performance__computing:__reproducibility_vs.__parallelization.qmd @@ -200,7 +200,7 @@ to the workers one by one in a simple order (see [round-robin](https://en.wikipedia.org/wiki/Round-robin_scheduling)). While less efficient, this approach provides greater control over task execution and is better suited for ensuring reproducibility by initializing -[L'Ecuyer-CMRG RNG streams](https://pubsonline.informs.org/doi/10.1287/opre.47.1.159). +[L'Ecuyer-CMRG RNG streams](https://doi.org/10.1287/opre.47.1.159). In the following example, we simulate drug efficacy across different patient cohorts using parallel processing with the `{mirai}` package. We define three