diff --git a/DESCRIPTION b/DESCRIPTION index dd0cae2..7fdaea5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -15,6 +15,10 @@ Authors@R: family = "Hayes", role = c("aut"), comment = c(ORCID = "0000-0002-4985-5160")), + person(given = "Rob", + family = "Hyndman", + role = c("aut"), + comment = c(ORCID = "0000-0002-2140-5352")), person(given = "Earo", family = "Wang", role = c("ctb"), @@ -46,6 +50,7 @@ Suggests: covr, mvtnorm, actuar (>= 2.0.0), + evd, ggdist, ggplot2 RdMacros: @@ -55,4 +60,4 @@ BugReports: https://github.com/mitchelloharawild/distributional/issues Encoding: UTF-8 Language: en-GB Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 diff --git a/NAMESPACE b/NAMESPACE index 3a82b07..882b26e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -143,6 +143,14 @@ S3method(dim,dist_default) S3method(dim,dist_multinomial) S3method(dim,dist_mvnorm) S3method(dimnames,distribution) +S3method(distributional::cdf,dist_gev) +S3method(distributional::cdf,dist_gpd) +S3method(distributional::covariance,dist_gev) +S3method(distributional::covariance,dist_gpd) +S3method(distributional::generate,dist_gev) +S3method(distributional::generate,dist_gpd) +S3method(distributional::log_density,dist_gev) +S3method(distributional::log_density,dist_gpd) S3method(family,dist_default) S3method(family,distribution) S3method(format,dist_bernoulli) @@ -158,6 +166,8 @@ S3method(format,dist_exponential) S3method(format,dist_f) S3method(format,dist_gamma) S3method(format,dist_geometric) +S3method(format,dist_gev) +S3method(format,dist_gpd) S3method(format,dist_gh) S3method(format,dist_gk) S3method(format,dist_gumbel) @@ -323,6 +333,8 @@ S3method(mean,dist_exponential) S3method(mean,dist_f) S3method(mean,dist_gamma) S3method(mean,dist_geometric) +S3method(mean,dist_gev) +S3method(mean,dist_gpd) S3method(mean,dist_gumbel) S3method(mean,dist_hypergeometric) S3method(mean,dist_inflated) @@ -422,6 +434,12 @@ S3method(skewness,dist_sample) S3method(skewness,dist_uniform) S3method(skewness,dist_weibull) S3method(skewness,distribution) +S3method(stats::density,dist_gev) +S3method(stats::density,dist_gpd) +S3method(stats::median,dist_gev) +S3method(stats::median,dist_gpd) +S3method(stats::quantile,dist_gev) +S3method(stats::quantile,dist_gpd) S3method(sum,distribution) S3method(support,dist_categorical) S3method(support,dist_default) @@ -467,6 +485,8 @@ export(dist_exponential) export(dist_f) export(dist_gamma) export(dist_geometric) +export(dist_gev) +export(dist_gpd) export(dist_gh) export(dist_gk) export(dist_gumbel) diff --git a/NEWS.md b/NEWS.md index de7b2e7..f467cb0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -13,6 +13,8 @@ * `support()` now shows whether the interval of support is open or closed (@venpopov, #97) +* Added `dist_gev()` for the Generalised Extreme Value distribution and + `dist_gpd()` for the Generalised Pareto distribution (@robjhyndman, #124). ### Probability distributions diff --git a/R/dist_gev.R b/R/dist_gev.R new file mode 100644 index 0000000..adc1174 --- /dev/null +++ b/R/dist_gev.R @@ -0,0 +1,132 @@ +#' The Generalized Extreme Value Distribution +#' +#' The GEV distribution function with parameters \eqn{\code{location} = a}, +#' \eqn{\code{scale} = b} and \eqn{\code{shape} = s} is +#' +#' \deqn{F(x) = \exp\left[-\{1+s(x-a)/b\}^{-1/s}\right]} +#' +#' for \eqn{1+s(x-a)/b > 0}, where \eqn{b > 0}. If \eqn{s = 0} the distribution +#' is defined by continuity, giving +#' +#' \deqn{F(x) = \exp\left[-\exp\left(-\frac{x-a}{b}\right)\right]} +#' +#' The support of the distribution is the real line if \eqn{s = 0}, +#' \eqn{x \geq a - b/s} if \eqn{s \neq 0}, and +#' \eqn{x \leq a - b/s} if \eqn{s < 0}. +#' +#' The parametric form of the GEV encompasses that of the Gumbel, Frechet and +#' reverse Weibull distributions, which are obtained for \eqn{s = 0}, +#' \eqn{s > 0} and \eqn{s < 0} respectively. It was first introduced by +#' Jenkinson (1955). +#' +#' @references Jenkinson, A. F. (1955) The frequency distribution of the annual +#' maximum (or minimum) of meteorological elements. \emph{Quart. J. R. Met. Soc.}, +#' \bold{81}, 158–171. +#' @param location the location parameter \eqn{a} of the GEV distribution. +#' @param scale the scale parameter \eqn{b} of the GEV distribution. +#' @param shape the shape parameter \eqn{s} of the GEV distribution. +#' @seealso \code{\link[evd]{gev}} +#' @examples +#' dist <- dist_gev(location = 0, scale = 1, shape = 0) +#' @export + +dist_gev <- function(location, scale, shape) { + location <- vctrs::vec_cast(unname(location), double()) + shape <- vctrs::vec_cast(unname(shape), double()) + scale <- vctrs::vec_cast(unname(scale), double()) + if (any(scale <= 0)) { + stop("The scale parameter of a GEV distribution must be strictly positive") + } + distributional::new_dist(location = location, scale = scale, shape = shape, class = "dist_gev") +} + +#' @export +format.dist_gev <- function(x, digits = 2, ...) { + sprintf( + "GEV(%s, %s, %s)", + format(x[["location"]], digits = digits, ...), + format(x[["scale"]], digits = digits, ...), + format(x[["shape"]], digits = digits, ...) + ) +} + +#' @exportS3Method distributional::log_density +log_density.dist_gev <- function(x, at, ...) { + z <- (at - x[["location"]]) / x[["scale"]] + if (x[["shape"]] == 0) { + pdf <- -z - exp(-z) + } else { + xx <- 1 + x[["shape"]] * z + xx[xx <= 0] <- NA_real_ + pdf <- -xx^(-1 / x[["shape"]]) - (1 / x[["shape"]] + 1) * log(xx) + pdf[is.na(pdf)] <- -Inf + } + pdf - log(x[["scale"]]) +} + +#' @exportS3Method stats::density +density.dist_gev <- function(x, at, ...) { + exp(log_density.dist_gev(x, at, ...)) +} + +#' @exportS3Method distributional::cdf +cdf.dist_gev <- function(x, q, ...) { + z <- (q - x[["location"]]) / x[["scale"]] + if (x[["shape"]] == 0) { + exp(-exp(-z)) + } else { + exp(-pmax(1 + x[["shape"]] * z, 0)^(-1 / x[["shape"]])) + } +} + +#' @exportS3Method stats::quantile +quantile.dist_gev <- function(x, p, ...) { + if (x[["shape"]] == 0) { + x[["location"]] - x[["scale"]] * log(-log(p)) + } else { + x[["location"]] + x[["scale"]] * ((-log(p))^(-x[["shape"]]) - 1) / x[["shape"]] + } +} + +#' @exportS3Method distributional::generate +generate.dist_gev <- function(x, times, ...) { + z <- stats::rexp(times) + if (x[["shape"]] == 0) { + x[["location"]] - x[["scale"]] * log(z) + } else { + x[["location"]] + x[["scale"]] * (z^(-x[["shape"]]) - 1) / x[["shape"]] + } +} + +#' @export +mean.dist_gev <- function(x, ...) { + if (x[["shape"]] == 0) { + x[["location"]] + x[["scale"]] * 0.57721566490153286 + } else if (x[["shape"]] < 1) { + x[["location"]] + x[["scale"]] * (gamma(1 - x[["shape"]]) - 1) / x[["shape"]] + } else { + Inf + } +} + +#' @exportS3Method stats::median +median.dist_gev <- function(x, ...) { + if (x[["shape"]] == 0) { + x[["location"]] - x[["scale"]] * log(log(2)) + } else { + x[["location"]] + x[["scale"]] * (log(2)^(-x[["shape"]]) - 1) / x[["shape"]] + } +} + +#' @exportS3Method distributional::covariance +covariance.dist_gev <- function(x, ...) { + if (x[["shape"]] == 0) { + x[["scale"]]^2 * pi^2 / 6 + } else if (x[["shape"]] < 0.5) { + g2 <- gamma(1 - 2 * x[["shape"]]) + g1 <- gamma(1 - x[["shape"]]) + x[["scale"]]^2 * (g2 - g1^2) / x[["shape"]]^2 + } else { + Inf + } +} diff --git a/R/dist_gpd.R b/R/dist_gpd.R new file mode 100644 index 0000000..e5c95ba --- /dev/null +++ b/R/dist_gpd.R @@ -0,0 +1,124 @@ +#' The Generalized Pareto Distribution +#' +#' The GPD distribution function with parameters \eqn{\code{location} = a}, +#' \eqn{\code{scale} = b} and \eqn{\code{shape} = s} is +#' +#' \deqn{F(x) = 1 - \left(1+s(x-a)/b\right)^{-1/s}} +#' +#' for \eqn{1+s(x-a)/b > 0}, where \eqn{b > 0}. If \eqn{s = 0} the distribution +#' is defined by continuity, giving +#' +#' \deqn{F(x) = 1 - \exp\left(-\frac{x-a}{b}\right)} +#' +#' The support of the distribution is \eqn{x \geq a} if \eqn{s \geq 0}, and +#' \eqn{a \leq x \leq a -b/s} if \eqn{s < 0}. +#' +#' The Pickands–Balkema–De Haan theorem states that for a large class of +#' distributions, the tail (above some threshold) can be approximated by a GPD. +#' +#' @param location the location parameter \eqn{a} of the GPD distribution. +#' @param scale the scale parameter \eqn{b} of the GPD distribution. +#' @param shape the shape parameter \eqn{s} of the GPD distribution. +#' @seealso \code{\link[evd]{gpd}} +#' @examples +#' dist <- dist_gpd(location = 0, scale = 1, shape = 0) +#' @export + +dist_gpd <- function(location, scale, shape) { + location <- vctrs::vec_cast(unname(location), double()) + shape <- vctrs::vec_cast(unname(shape), double()) + scale <- vctrs::vec_cast(unname(scale), double()) + if (any(scale <= 0)) { + stop("The scale parameter of a GPD distribution must be strictly positive") + } + distributional::new_dist(location = location, scale = scale, shape = shape, class = "dist_gpd") +} + +#' @export +format.dist_gpd <- function(x, digits = 2, ...) { + sprintf( + "GPD(%s, %s, %s)", + format(x[["location"]], digits = digits, ...), + format(x[["scale"]], digits = digits, ...), + format(x[["shape"]], digits = digits, ...) + ) +} + +#' @exportS3Method distributional::log_density +log_density.dist_gpd <- function(x, at, ...) { + z <- (at - x[["location"]]) / x[["scale"]] + if (x[["shape"]] == 0) { + pdf <- -z + } else { + xx <- 1 + x[["shape"]] * z + xx[xx <= 0] <- NA_real_ + pdf <- -(1 / x[["shape"]] + 1) * log(xx) + pdf[is.na(pdf)] <- -Inf + } + if (x[["shape"]] >= 0) { + pdf[z < 0] <- -Inf + } else { + pdf[z < 0 | z > -1 / x[["shape"]]] <- -Inf + } + pdf - log(x[["scale"]]) +} + +#' @exportS3Method stats::density +density.dist_gpd <- function(x, at, ...) { + exp(log_density.dist_gpd(x, at, ...)) +} + +#' @exportS3Method distributional::cdf +cdf.dist_gpd <- function(x, q, ...) { + z <- pmax(q - x[["location"]], 0) / x[["scale"]] + if (x[["shape"]] == 0) { + 1 - exp(-z) + } else { + 1 - pmax(1 + x[["shape"]] * z, 0)^(-1 / x[["shape"]]) + } +} + +#' @exportS3Method stats::quantile +quantile.dist_gpd <- function(x, p, ...) { + if (x[["shape"]] == 0) { + x[["location"]] - x[["scale"]] * log(1 - p) + } else { + x[["location"]] + x[["scale"]] * ((1 - p)^(-x[["shape"]]) - 1) / x[["shape"]] + } +} + +#' @exportS3Method distributional::generate +generate.dist_gpd <- function(x, times, ...) { + if (x[["shape"]] == 0) { + x[["location"]] + x[["scale"]] * stats::rexp(times) + } else { + quantile(x, stats::runif(times)) + } +} + +#' @export +mean.dist_gpd <- function(x, ...) { + if (x[["shape"]] < 1) { + x[["location"]] + x[["scale"]] / (1 - x[["shape"]]) + } else { + Inf + } +} + +#' @exportS3Method stats::median +median.dist_gpd <- function(x, ...) { + if (x[["shape"]] == 0) { + x[["location"]] - x[["scale"]] * log(0.5) + } else { + x[["location"]] + x[["scale"]] * (2^x[["shape"]] - 1) / x[["shape"]] + } +} + +#' @exportS3Method distributional::covariance +covariance.dist_gpd <- function(x, ...) { + if (x[["shape"]] < 0.5) { + x[["scale"]]^2 / (1 - x[["shape"]])^2 / (1 - 2 * x[["shape"]]) + } else { + Inf + } +} diff --git a/man/dist_gev.Rd b/man/dist_gev.Rd new file mode 100644 index 0000000..b582776 --- /dev/null +++ b/man/dist_gev.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dist_gev.R +\name{dist_gev} +\alias{dist_gev} +\title{The Generalized Extreme Value Distribution} +\usage{ +dist_gev(location, scale, shape) +} +\arguments{ +\item{location}{the location parameter \eqn{a} of the GEV distribution.} + +\item{scale}{the scale parameter \eqn{b} of the GEV distribution.} + +\item{shape}{the shape parameter \eqn{s} of the GEV distribution.} +} +\description{ +The GEV distribution function with parameters \eqn{\code{location} = a}, +\eqn{\code{scale} = b} and \eqn{\code{shape} = s} is +} +\details{ +\deqn{F(x) = \exp\left[-\{1+s(x-a)/b\}^{-1/s}\right]} + +for \eqn{1+s(x-a)/b > 0}, where \eqn{b > 0}. If \eqn{s = 0} the distribution +is defined by continuity, giving + +\deqn{F(x) = \exp\left[-\exp\left(-\frac{x-a}{b}\right)\right]} + +The support of the distribution is the real line if \eqn{s = 0}, +\eqn{x \geq a - b/s} if \eqn{s \neq 0}, and +\eqn{x \leq a - b/s} if \eqn{s < 0}. + +The parametric form of the GEV encompasses that of the Gumbel, Frechet and +reverse Weibull distributions, which are obtained for \eqn{s = 0}, +\eqn{s > 0} and \eqn{s < 0} respectively. It was first introduced by +Jenkinson (1955). +} +\examples{ +dist <- dist_gev(location = 0, scale = 1, shape = 0) +} +\references{ +Jenkinson, A. F. (1955) The frequency distribution of the annual +maximum (or minimum) of meteorological elements. \emph{Quart. J. R. Met. Soc.}, +\bold{81}, 158–171. +} +\seealso{ +\code{\link[evd]{gev}} +} diff --git a/man/dist_gpd.Rd b/man/dist_gpd.Rd new file mode 100644 index 0000000..b6c2e9c --- /dev/null +++ b/man/dist_gpd.Rd @@ -0,0 +1,39 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/dist_gpd.R +\name{dist_gpd} +\alias{dist_gpd} +\title{The Generalized Pareto Distribution} +\usage{ +dist_gpd(location, scale, shape) +} +\arguments{ +\item{location}{the location parameter \eqn{a} of the GPD distribution.} + +\item{scale}{the scale parameter \eqn{b} of the GPD distribution.} + +\item{shape}{the shape parameter \eqn{s} of the GPD distribution.} +} +\description{ +The GPD distribution function with parameters \eqn{\code{location} = a}, +\eqn{\code{scale} = b} and \eqn{\code{shape} = s} is +} +\details{ +\deqn{F(x) = 1 - \left(1+s(x-a)/b\right)^{-1/s}} + +for \eqn{1+s(x-a)/b > 0}, where \eqn{b > 0}. If \eqn{s = 0} the distribution +is defined by continuity, giving + +\deqn{F(x) = 1 - \exp\left(-\frac{x-a}{b}\right)} + +The support of the distribution is \eqn{x \geq a} if \eqn{s \geq 0}, and +\eqn{a \leq x \leq a -b/s} if \eqn{s < 0}. + +The Pickands–Balkema–De Haan theorem states that for a large class of +distributions, the tail (above some threshold) can be approximated by a GPD. +} +\examples{ +dist <- dist_gpd(location = 0, scale = 1, shape = 0) +} +\seealso{ +\code{\link[evd]{gpd}} +} diff --git a/man/distributional-package.Rd b/man/distributional-package.Rd index 338a2ac..4fc788c 100644 --- a/man/distributional-package.Rd +++ b/man/distributional-package.Rd @@ -24,6 +24,7 @@ Authors: \itemize{ \item Matthew Kay (\href{https://orcid.org/0000-0001-9446-0419}{ORCID}) \item Alex Hayes (\href{https://orcid.org/0000-0002-4985-5160}{ORCID}) + \item Rob Hyndman (\href{https://orcid.org/0000-0002-2140-5352}{ORCID}) } Other contributors: diff --git a/tests/testthat/test_gev.R b/tests/testthat/test_gev.R new file mode 100644 index 0000000..874fad8 --- /dev/null +++ b/tests/testthat/test_gev.R @@ -0,0 +1,58 @@ +test_that("GEV", { + dist <- dist_gev(location = c(0, .5, 0), scale = c(1, 2, 3), shape = c(0, 0.1, 1.1)) + euler <- 0.57721566490153286 # Euler's constant + # Mean + expect_equal(mean(dist), c( + euler, + 0.5 + 2 * (gamma(0.9) - 1) / 0.1, # location + scale*(gamma(1 - shape) - 1)/shape + Inf # Since shape >= 1 + ), tolerance = 0.0001) + # Median + expect_equal(median(dist), c( + -log(log(2)), + 0.5 + 2 * (log(2)^(-0.1) - 1) / 0.1, # location + scale*(log(2)^(-shape) - 1)/shape + 3 * (log(2)^(-1.1) - 1) / 1.1 # location + scale*(log(2)^(-shape) - 1)/shape + ), tolerance = 0.0001) + expect_equal(median(dist), quantile(dist, 0.5)) + # Variance + expect_equal(distributional::variance(dist), c( + pi^2 / 6, + 2^2 * (gamma(1 - 2 * 0.1) - gamma(1 - 0.1)^2) / 0.1^2, # scale^2 * (g2 - g1^2)/shape^2 + Inf # since shape >= 0.5 + ), tolerance = 0.0001) + # Density + at <- (0:20) / 10 + expect_equal(density(dist, at), list( + evd::dgev(at, loc = 0, scale = 1, shape = 0), + evd::dgev(at, loc = 0.5, scale = 2, shape = 0.1), + evd::dgev(at, loc = 0, scale = 3, shape = 1.1) + )) + # CDF + expect_equal(distributional::cdf(dist, at), list( + evd::pgev(at, loc = 0, scale = 1, shape = 0), + evd::pgev(at, loc = 0.5, scale = 2, shape = 0.1), + evd::pgev(at, loc = 0, scale = 3, shape = 1.1) + )) + # Quantiles + p <- (1:19) / 20 + expect_equal(quantile(dist, p = p), list( + evd::qgev(p = p, loc = 0, scale = 1, shape = 0), + evd::qgev(p = p, loc = 0.5, scale = 2, shape = 0.1), + evd::qgev(p = p, loc = 0, scale = 3, shape = 1.1) + )) + # Generate + set.seed(123) + rand_dist <- distributional::generate(dist, times = 1e6) + expect_equal(lapply(rand_dist[1:2], mean) |> unlist(), + mean(dist)[1:2], + tolerance = 0.01 + ) + expect_equal(lapply(rand_dist[1:2], var) |> unlist(), + distributional::variance(dist)[1:2], + tolerance = 0.01 + ) + expect_equal(lapply(rand_dist, median) |> unlist(), + median(dist), + tolerance = 0.01 + ) +}) diff --git a/tests/testthat/test_gpd.R b/tests/testthat/test_gpd.R new file mode 100644 index 0000000..b40172f --- /dev/null +++ b/tests/testthat/test_gpd.R @@ -0,0 +1,57 @@ +test_that("GPD", { + dist <- dist_gpd(location = c(0, .5, 0), scale = c(1, 2, 3), shape = c(0, 0.1, 1.1)) + # Mean + expect_equal(mean(dist), c( + 1, + 0.5 + 2 / 0.9, # location + scale/(1 - shape) + Inf # Since shape >= 1 + ), tolerance = 0.0001) + # Median + expect_equal(median(dist), c( + -log(0.5), + 0.5 + 2 * (2^0.1 - 1) / 0.1, # location + scale * (2^shape - 1) / shape + 3 * (2^1.1 - 1) / 1.1 # location + scale * (2^shape - 1) / shape + ), tolerance = 0.0001) + expect_equal(median(dist), quantile(dist, 0.5)) + # Variance + expect_equal(distributional::variance(dist), c( + 1, + 2^2 / 0.9^2 / (1 - 2 * 0.1), # scale^2 / (1 - shape)^2 / (1 - 2 * shape) + Inf # since shape >= 0.5 + ), tolerance = 0.0001) + # Density + at <- (0:20) / 10 + 1e-8 # Avoiding being on the boundary where evd gives wrong result + expect_equal(density(dist, at), list( + evd::dgpd(at, loc = 0, scale = 1, shape = 0), + evd::dgpd(at, loc = 0.5, scale = 2, shape = 0.1), + evd::dgpd(at, loc = 0, scale = 3, shape = 1.1) + )) + # CDF + expect_equal(distributional::cdf(dist, at), list( + evd::pgpd(at, loc = 0, scale = 1, shape = 0), + evd::pgpd(at, loc = 0.5, scale = 2, shape = 0.1), + evd::pgpd(at, loc = 0, scale = 3, shape = 1.1) + )) + # Quantiles + p <- (1:19) / 20 + expect_equal(quantile(dist, p = p), list( + evd::qgpd(p = p, loc = 0, scale = 1, shape = 0), + evd::qgpd(p = p, loc = 0.5, scale = 2, shape = 0.1), + evd::qgpd(p = p, loc = 0, scale = 3, shape = 1.1) + )) + # Generate + set.seed(123) + rand_dist <- distributional::generate(dist, times = 1e6) + expect_equal(lapply(rand_dist[1:2], mean) |> unlist(), + mean(dist)[1:2], + tolerance = 0.01 + ) + expect_equal(lapply(rand_dist[1:2], var) |> unlist(), + distributional::variance(dist)[1:2], + tolerance = 0.01 + ) + expect_equal(lapply(rand_dist, median) |> unlist(), + median(dist), + tolerance = 0.01 + ) +})