title | author | date | output | ||||
---|---|---|---|---|---|---|---|
Chapter 20 |
Julin Maloof |
2023-08-03 |
|
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the �]8;;http://conflicted.r-lib.org/�conflicted package�]8;;� to force all conflicts to become errors
library(rlang)
##
## Attaching package: 'rlang'
##
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
Evaluation is the developer equivalent of unquotation...it allows the user to evaluate quoted expressions
Two concepts: quosure, captures an expression and its environment, and data masking, which allows ealuation in the context of a data frame.
quasiquotation, quosures, and data maksing make up tidy evaluation
eval
takes two arguments, an expression and an enviornment (calling environment is the default)
x <- 10
eval(expr(x))
## [1] 10
#> [1] 10
y <- 2
eval(expr(x + y))
## [1] 12
#> [1] 12
#>
#
eval(expr(x + y), env(x = 1000))
## [1] 1002
#> [1] 1002
Local allows you to create temporary variables:
# Clean up variables created earlier
rm(x, y)
foo <- local({
x <- 10
y <- 200
x + y
})
foo
## [1] 210
#> [1] 210
x
## Error in eval(expr, envir, enclos): object 'x' not found
#> Error in eval(expr, envir, enclos): object 'x' not found
y
## Error in eval(expr, envir, enclos): object 'y' not found
#> Error in eval(expr, envir, enclos): object 'y' not found
capture expression, create new environment with calling environment as parent:
local2 <- function(expr) {
env <- env(caller_env())
eval(enexpr(expr), env)
}
foo <- local2({
x <- 10
y <- 200
x + y
})
foo
## [1] 210
#> [1] 210
x
## Error in eval(expr, envir, enclos): object 'x' not found
#> Error in eval(expr, envir, enclos): object 'x' not found
y
## Error in eval(expr, envir, enclos): object 'y' not found
#> Error in eval(expr, envir, enclos): object 'y' not found
source2 <- function(path, env = caller_env()) {
file <- paste(readLines(path, warn = FALSE), collapse = "\n")
exprs <- parse_exprs(file)
res <- NULL
for (i in seq_along(exprs)) {
res <- eval(exprs[[i]], env)
}
invisible(res)
}
beware if using this to create functions. use new_function()
or set the src_ref attribute to NULL
1. Carefully read the documentation for source(). What environment does it use by default? What if you supply local = TRUE? How do you provide a custom environment?
default is global environment; local=TRUE os calling environment. local=env sets environment to env
eval(expr(eval(expr(eval(expr(2 + 2)))))) # 4
## [1] 4
eval(eval(expr(eval(expr(eval(expr(2 + 2))))))) # 4
## [1] 4
expr(eval(expr(eval(expr(eval(expr(2 + 2))))))) # eval(expr(eval(expr(eval(expr(2 + 2))))))
## eval(expr(eval(expr(eval(expr(2 + 2))))))
3. Fill in the function bodies below to re-implement get() using sym() and eval(), and assign() using sym(), expr(), and eval(). Don’t worry about the multiple ways of choosing an environment that get() and assign() support; assume that the user supplies it explicitly.
# name is a string
myenv <- env(x=10,y=3)
get2 <- function(name, env) {
name <- sym(name)
eval(name, env=env)
}
get("y", myenv)
## [1] 3
get2("y", myenv)
## [1] 3
myenv <- env(x=10,y=3)
assign2 <- function(name, value, env) {
name <- sym(name)
#eval(expr(name <- value))
eval(expr(!!name <- !!value), envir = env)
}
assign2("z", 5, myenv)
myenv$z
## [1] 5
4 Modify source2() so it returns the result of every expression, not just the last one. Can you eliminate the for loop?
writeLines("4+3
10*10
a <- 'apple'
toupper(a)",
"sourceme20.R")
source2 <- function(path, env = caller_env()) {
file <- paste(readLines(path, warn = FALSE), collapse = "\n")
exprs <- parse_exprs(file)
res <- NULL
for (i in seq_along(exprs)) {
res <- eval(exprs[[i]], env)
}
invisible(res)
}
source3 <- function(path, env = caller_env()) {
file <- paste(readLines(path, warn = FALSE), collapse = "\n")
exprs <- parse_exprs(file)
res <- list()
for (i in seq_along(exprs)) {
res[[i]] <- eval(exprs[[i]], env)
}
invisible(res)
}
s1 <- source("sourceme20.R")
cat("S1\n")
## S1
s1
## $value
## [1] "APPLE"
##
## $visible
## [1] TRUE
s2 <- source2("sourceme20.R")
cat("\n-------\nS2\n")
##
## -------
## S2
s2
## [1] "APPLE"
s3 <- source3("sourceme20.R")
cat("\n-------\nS3\n")
##
## -------
## S3
s3
## [[1]]
## [1] 7
##
## [[2]]
## [1] 100
##
## [[3]]
## [1] "apple"
##
## [[4]]
## [1] "APPLE"
Contain both an expression and an environment
Use enquo
and enquos
to capture user provided expressions
foo <- function(x) enquo(x)
foo(a + b)
## <quosure>
## expr: ^a + b
## env: global
Or quo
and quos
but this isn't recommended
Or new_quosure
new_quosure(expr(x + y), env(x = 1, y = 10))
## <quosure>
## expr: ^x + y
## env: 0x1209ee510
Evaluate with eval_tidy
q1 <- new_quosure(expr(x + y), env(x = 1, y = 10))
eval_tidy(q1)
## [1] 11
q1 <- new_quosure(expr(x), env(x = 1))
q1
## <quosure>
## expr: ^x
## env: 0x12142bb30
eval_tidy(q1)
## [1] 1
## 1
q2 <- new_quosure(expr(x + !!q1), env(x = 10))
q2
## <quosure>
## expr: ^x + (^x)
## env: 0x1215ec070
eval_tidy(q2)
## [1] 11
## 11
q3 <- new_quosure(expr(x + !!q2), env(x = 100))
q3
## <quosure>
## expr: ^x + (^x + (^x))
## env: 0x121996940
eval_tidy(q3)
## [1] 111
## 111
2. Write an enenv() function that captures the environment associated with an argument. (Hint: this should only require two function calls.)
enenv <- function(x) {
get_env(enquo(x))
}
enenv(10)
## <environment: R_EmptyEnv>
Z <- 10
enenv(z)
## <environment: R_GlobalEnv>
data masks
a data fram argument provided to tidy_eval where variables are looked for.
If neded, can specify .data$x and .env$x to be explicit about where variables should be looked for
1. Why did I use a for loop in transform2() instead of map()? Consider transform2(df, x = x * 2, x = x * 2).
Because map will create a list with 2 columns of the same name, and they will not be evaluated recursively (for map, both operations work on the original x, I think)
subset2 <- function(data, rows) {
rows <- enquo(rows)
rows_val <- eval_tidy(rows, data)
stopifnot(is.logical(rows_val))
data[rows_val, , drop = FALSE]
}
subset3 <- function(data, rows) {
rows <- enquo(rows)
eval_tidy(expr(data[!!rows, , drop = FALSE]), data = data)
}
df <- data.frame(x = 1:3)
subset3(df, x == 1)
## x
## 1 1
I would think that these would both work equally well. subset2 is clearer coding and also will maybe give a better error message. Also can get into trouble if the df has a column "data"
3. The following function implements the basics of dplyr::arrange(). Annotate each line with a comment explaining what it does. Can you explain why !!.na.last is strictly correct, but omitting the !! is unlikely to cause problems?
arrange2 <- function(.df, ..., .na.last = TRUE) {
args <- enquos(...) ## get the ... expressions and quote them
order_call <- expr(order(!!!args, na.last = !!.na.last)) ## screate an expression that will call the function "order" with the arugments (which will be column names)
ord <- eval_tidy(order_call, .df) # now run the order call, using the data frame as the environment
stopifnot(length(ord) == nrow(.df)) # make sure nothing went wrong
.df[ord, , drop = FALSE] # actually reorder the df and return it
}
!!.na.last will give the value of na.last as an argument in the expression, but without this it will be evaluated later and this should be ok becuase na.last will be in the calling environment.
1. I’ve included an alternative implementation of threshold_var() below. What makes it different to the approach I used above? What makes it harder?
threshold_var <- function(df, var, val) {
var <- ensym(var)
subset2(df, `$`(.data, !!var) >= !!val)
}
Here we are using the inilne version of the $
function, and that makes the code harder to read.
lm3a <- function(formula, data) {
formula <- enexpr(formula)
lm_call <- expr(lm(!!formula, data = data))
eval(lm_call, caller_env())
}
lm3a(mpg ~ disp, mtcars)$call
It fails because data
does not exist in caller_env()
. So we need to quote it and then unquote it in the call, as is done in the lm3
given in the book
2. When model building, typically the response and data are relatively constant while you rapidly experiment with different predictors. Write a small wrapper that allows you to reduce duplication in the code below.
lm(mpg ~ disp, data = mtcars)
##
## Call:
## lm(formula = mpg ~ disp, data = mtcars)
##
## Coefficients:
## (Intercept) disp
## 29.59985 -0.04122
lm(mpg ~ I(1 / disp), data = mtcars)
##
## Call:
## lm(formula = mpg ~ I(1/disp), data = mtcars)
##
## Coefficients:
## (Intercept) I(1/disp)
## 10.75 1557.67
lm(mpg ~ disp * cyl, data = mtcars)
##
## Call:
## lm(formula = mpg ~ disp * cyl, data = mtcars)
##
## Coefficients:
## (Intercept) disp cyl disp:cyl
## 49.03721 -0.14553 -3.40524 0.01585
What is the idea? give the formulas in ... and loop through them?
A second approach would be to have one arugment for the response, and have the predictors in ...
Start with idea one.
lms_1 <- function(..., data, env=caller_env()) {
formulas <- enexprs(...)
data <- enexpr(data)
res <- list()
for(i in seq_along(formulas)) {
lm_call <- expr(lm(!!formulas[[i]], data=!!data))
expr_print(lm_call)
res[[i]] <- eval(lm_call, env=env)
cat("----------------\n\n")
}
res
}
lms_1(mpg ~ disp, mpg ~ I(1/disp), mpg ~ disp * cyl, data=mtcars)
## lm(mpg ~ disp, data = mtcars)
## ----------------
##
## lm(mpg ~ I(1 / disp), data = mtcars)
## ----------------
##
## lm(mpg ~ disp * cyl, data = mtcars)
## ----------------
## [[1]]
##
## Call:
## lm(formula = mpg ~ disp, data = mtcars)
##
## Coefficients:
## (Intercept) disp
## 29.59985 -0.04122
##
##
## [[2]]
##
## Call:
## lm(formula = mpg ~ I(1/disp), data = mtcars)
##
## Coefficients:
## (Intercept) I(1/disp)
## 10.75 1557.67
##
##
## [[3]]
##
## Call:
## lm(formula = mpg ~ disp * cyl, data = mtcars)
##
## Coefficients:
## (Intercept) disp cyl disp:cyl
## 49.03721 -0.14553 -3.40524 0.01585
How about a list of predictors?
lms_2 <- function(response, ..., data, env=caller_env()) {
response <- enexpr(response)
predictors <- enexprs(...)
data <- enexpr(data)
res <- list()
for(i in seq_along(predictors)) {
lm_call <- expr(lm(!!response ~ !!predictors[[i]], data=!!data))
expr_print(lm_call)
res[[i]] <- eval(lm_call, env=env)
cat("----------------\n\n")
}
res
}
lms_2(mpg, disp, I(1/disp), disp * cyl, data=mtcars)
## lm(mpg ~ disp, data = mtcars)
## ----------------
##
## lm(mpg ~ I(1 / disp), data = mtcars)
## ----------------
##
## lm(mpg ~ disp * cyl, data = mtcars)
## ----------------
## [[1]]
##
## Call:
## lm(formula = mpg ~ disp, data = mtcars)
##
## Coefficients:
## (Intercept) disp
## 29.59985 -0.04122
##
##
## [[2]]
##
## Call:
## lm(formula = mpg ~ I(1/disp), data = mtcars)
##
## Coefficients:
## (Intercept) I(1/disp)
## 10.75 1557.67
##
##
## [[3]]
##
## Call:
## lm(formula = mpg ~ disp * cyl, data = mtcars)
##
## Coefficients:
## (Intercept) disp cyl disp:cyl
## 49.03721 -0.14553 -3.40524 0.01585
3. Another way to write resample_lm() would be to include the resample expression (data[sample(nrow(data), replace = TRUE), , drop = FALSE]) in the data argument. Implement that approach. What are the advantages? What are the disadvantages?
advantage: user control. disadvantage: user has to rewrite it if there df is not called "data". Uglier. lm output is uglier.
resample_lm3 <- function(formula, data=data[sample(nrow(data), replace = TRUE), , drop = FALSE], env = caller_env()) {
formula <- enexpr(formula)
data = enexpr(data)
lm_call <- expr(lm(!!formula, data = !!data))
expr_print(lm_call)
eval(lm_call, env)
}
df <- data.frame(x = 1:10, y = 5 + 3 * (1:10) + round(rnorm(10), 2))
resample_lm3(y ~ x, data=df[sample(nrow(df), replace = TRUE), , drop = FALSE])
## lm(y ~ x, data = df[sample(nrow(df), replace = TRUE), , drop = FALSE])
##
## Call:
## lm(formula = y ~ x, data = df[sample(nrow(df), replace = TRUE),
## , drop = FALSE])
##
## Coefficients:
## (Intercept) x
## 2.770 3.327