diff --git a/botorch/acquisition/acquisition.py b/botorch/acquisition/acquisition.py
index c7a8ccbd64..60c5558482 100644
--- a/botorch/acquisition/acquisition.py
+++ b/botorch/acquisition/acquisition.py
@@ -10,7 +10,6 @@
 
 import warnings
 from abc import ABC, abstractmethod
-from typing import Optional
 
 import torch
 from botorch.exceptions import BotorchWarning
@@ -41,7 +40,7 @@ def __init__(self, model: Model) -> None:
         super().__init__()
         self.model: Model = model
 
-    def set_X_pending(self, X_pending: Optional[Tensor] = None) -> None:
+    def set_X_pending(self, X_pending: Tensor | None = None) -> None:
         r"""Informs the acquisition function about pending design points.
 
         Args:
@@ -115,7 +114,7 @@ class MCSamplerMixin(ABC):
 
     _default_sample_shape = torch.Size([512])
 
-    def __init__(self, sampler: Optional[MCSampler] = None) -> None:
+    def __init__(self, sampler: MCSampler | None = None) -> None:
         r"""Register the sampler on the acquisition function.
 
         Args:
diff --git a/botorch/acquisition/active_learning.py b/botorch/acquisition/active_learning.py
index 4125a88b11..caa5eb9082 100644
--- a/botorch/acquisition/active_learning.py
+++ b/botorch/acquisition/active_learning.py
@@ -23,7 +23,6 @@
 
 from __future__ import annotations
 
-from typing import Optional
 
 import torch
 from botorch import settings
@@ -53,9 +52,9 @@ def __init__(
         self,
         model: Model,
         mc_points: Tensor,
-        sampler: Optional[MCSampler] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
+        sampler: MCSampler | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
     ) -> None:
         r"""q-Integrated Negative Posterior Variance.
 
@@ -140,7 +139,7 @@ def __init__(
         self,
         model: Model,
         objective: MCAcquisitionObjective,
-        sampler: Optional[MCSampler] = None,
+        sampler: MCSampler | None = None,
     ) -> None:
         r"""Pairwise Monte Carlo Posterior Variance
 
diff --git a/botorch/acquisition/analytic.py b/botorch/acquisition/analytic.py
index 33e1b6b5d7..fb774eab28 100644
--- a/botorch/acquisition/analytic.py
+++ b/botorch/acquisition/analytic.py
@@ -16,7 +16,6 @@
 from abc import ABC
 from contextlib import nullcontext
 from copy import deepcopy
-from typing import Optional, Union
 
 import torch
 from botorch.acquisition.acquisition import AcquisitionFunction
@@ -52,7 +51,7 @@ class AnalyticAcquisitionFunction(AcquisitionFunction, ABC):
     def __init__(
         self,
         model: Model,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        posterior_transform: PosteriorTransform | None = None,
     ) -> None:
         r"""Base constructor for analytic acquisition functions.
 
@@ -76,14 +75,14 @@ def __init__(
             )
         self.posterior_transform = posterior_transform
 
-    def set_X_pending(self, X_pending: Optional[Tensor] = None) -> None:
+    def set_X_pending(self, X_pending: Tensor | None = None) -> None:
         raise UnsupportedError(
             "Analytic acquisition functions do not account for X_pending yet."
         )
 
     def _mean_and_sigma(
         self, X: Tensor, compute_sigma: bool = True, min_var: float = 1e-12
-    ) -> tuple[Tensor, Optional[Tensor]]:
+    ) -> tuple[Tensor, Tensor | None]:
         """Computes the first and second moments of the model posterior.
 
         Args:
@@ -135,8 +134,8 @@ class LogProbabilityOfImprovement(AnalyticAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        best_f: Union[float, Tensor],
-        posterior_transform: Optional[PosteriorTransform] = None,
+        best_f: float | Tensor,
+        posterior_transform: PosteriorTransform | None = None,
         maximize: bool = True,
     ):
         r"""Single-outcome Probability of Improvement.
@@ -189,8 +188,8 @@ class ProbabilityOfImprovement(AnalyticAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        best_f: Union[float, Tensor],
-        posterior_transform: Optional[PosteriorTransform] = None,
+        best_f: float | Tensor,
+        posterior_transform: PosteriorTransform | None = None,
         maximize: bool = True,
     ):
         r"""Single-outcome Probability of Improvement.
@@ -237,8 +236,8 @@ class qAnalyticProbabilityOfImprovement(AnalyticAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        best_f: Union[float, Tensor],
-        posterior_transform: Optional[PosteriorTransform] = None,
+        best_f: float | Tensor,
+        posterior_transform: PosteriorTransform | None = None,
         maximize: bool = True,
     ) -> None:
         """qPI using an analytic approximation.
@@ -314,8 +313,8 @@ class ExpectedImprovement(AnalyticAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        best_f: Union[float, Tensor],
-        posterior_transform: Optional[PosteriorTransform] = None,
+        best_f: float | Tensor,
+        posterior_transform: PosteriorTransform | None = None,
         maximize: bool = True,
     ):
         r"""Single-outcome Expected Improvement (analytic).
@@ -378,8 +377,8 @@ class LogExpectedImprovement(AnalyticAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        best_f: Union[float, Tensor],
-        posterior_transform: Optional[PosteriorTransform] = None,
+        best_f: float | Tensor,
+        posterior_transform: PosteriorTransform | None = None,
         maximize: bool = True,
     ):
         r"""Logarithm of single-outcome Expected Improvement (analytic).
@@ -447,9 +446,9 @@ class LogConstrainedExpectedImprovement(AnalyticAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        best_f: Union[float, Tensor],
+        best_f: float | Tensor,
         objective_index: int,
-        constraints: dict[int, tuple[Optional[float], Optional[float]]],
+        constraints: dict[int, tuple[float | None, float | None]],
         maximize: bool = True,
     ) -> None:
         r"""Analytic Log Constrained Expected Improvement.
@@ -525,9 +524,9 @@ class ConstrainedExpectedImprovement(AnalyticAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        best_f: Union[float, Tensor],
+        best_f: float | Tensor,
         objective_index: int,
-        constraints: dict[int, tuple[Optional[float], Optional[float]]],
+        constraints: dict[int, tuple[float | None, float | None]],
         maximize: bool = True,
     ) -> None:
         r"""Analytic Constrained Expected Improvement.
@@ -606,7 +605,7 @@ def __init__(
         X_observed: Tensor,
         num_fantasies: int = 20,
         maximize: bool = True,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        posterior_transform: PosteriorTransform | None = None,
     ) -> None:
         r"""Single-outcome Noisy Log Expected Improvement (via fantasies).
 
@@ -762,8 +761,8 @@ class UpperConfidenceBound(AnalyticAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        beta: Union[float, Tensor],
-        posterior_transform: Optional[PosteriorTransform] = None,
+        beta: float | Tensor,
+        posterior_transform: PosteriorTransform | None = None,
         maximize: bool = True,
     ) -> None:
         r"""Single-outcome Upper Confidence Bound.
@@ -812,7 +811,7 @@ class PosteriorMean(AnalyticAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        posterior_transform: PosteriorTransform | None = None,
         maximize: bool = True,
     ) -> None:
         r"""Single-outcome Posterior Mean.
@@ -857,7 +856,7 @@ def __init__(
         self,
         model: Model,
         weights: Tensor,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        posterior_transform: PosteriorTransform | None = None,
     ) -> None:
         r"""Scalarized Posterior Mean.
@@ -919,7 +918,7 @@ class PosteriorStandardDeviation(AnalyticAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        posterior_transform: PosteriorTransform | None = None,
         maximize: bool = True,
     ) -> None:
         r"""Single-outcome Posterior Mean.
@@ -1135,8 +1134,8 @@ def _get_noiseless_fantasy_model(
 
 
 def _preprocess_constraint_bounds(
-    acqf: Union[LogConstrainedExpectedImprovement, ConstrainedExpectedImprovement],
-    constraints: dict[int, tuple[Optional[float], Optional[float]]],
+    acqf: LogConstrainedExpectedImprovement | ConstrainedExpectedImprovement,
+    constraints: dict[int, tuple[float | None, float | None]],
 ) -> None:
     r"""Set up constraint bounds.
 
@@ -1180,7 +1179,7 @@ def _preprocess_constraint_bounds(
 
 
 def _compute_log_prob_feas(
-    acqf: Union[LogConstrainedExpectedImprovement, ConstrainedExpectedImprovement],
+    acqf: LogConstrainedExpectedImprovement | ConstrainedExpectedImprovement,
     means: Tensor,
     sigmas: Tensor,
 ) -> Tensor:
diff --git a/botorch/acquisition/bayesian_active_learning.py b/botorch/acquisition/bayesian_active_learning.py
index 0c0e8dc53e..0e3ce87eae 100644
--- a/botorch/acquisition/bayesian_active_learning.py
+++ b/botorch/acquisition/bayesian_active_learning.py
@@ -22,7 +22,6 @@
 
 import warnings
 
-from typing import Optional, Union
 
 from botorch.acquisition.acquisition import AcquisitionFunction, MCSamplerMixin
 from botorch.acquisition.objective import PosteriorTransform
@@ -79,10 +78,10 @@ class qBayesianActiveLearningByDisagreement(
 ):
     def __init__(
         self,
-        model: Union[ModelListGP, SaasFullyBayesianSingleTaskGP],
-        sampler: Optional[MCSampler] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
+        model: ModelListGP | SaasFullyBayesianSingleTaskGP,
+        sampler: MCSampler | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
     ) -> None:
         """
         Batch implementation [kirsch2019batchbald]_ of BALD [Houlsby2011bald]_,
diff --git a/botorch/acquisition/cached_cholesky.py b/botorch/acquisition/cached_cholesky.py
index 674492a080..4910060f51 100644
--- a/botorch/acquisition/cached_cholesky.py
+++ b/botorch/acquisition/cached_cholesky.py
@@ -11,7 +11,6 @@
 from __future__ import annotations
 
 import warnings
-from typing import Optional
 
 import torch
 from botorch.acquisition.acquisition import MCSamplerMixin
@@ -72,7 +71,7 @@ def __init__(
         self,
         model: Model,
         cache_root: bool = False,
-        sampler: Optional[MCSampler] = None,
+        sampler: MCSampler | None = None,
     ) -> None:
         r"""Set class attributes and perform compatibility checks.
 
diff --git a/botorch/acquisition/cost_aware.py b/botorch/acquisition/cost_aware.py
index 9fa6212168..65b9e53f5b 100644
--- a/botorch/acquisition/cost_aware.py
+++ b/botorch/acquisition/cost_aware.py
@@ -13,7 +13,7 @@
 
 import warnings
 from abc import ABC, abstractmethod
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import torch
 from botorch import settings
@@ -35,7 +35,7 @@ class CostAwareUtility(Module, ABC):
 
     @abstractmethod
     def forward(
-        self, X: Tensor, deltas: Tensor, sampler: Optional[MCSampler] = None
+        self, X: Tensor, deltas: Tensor, sampler: MCSampler | None = None
     ) -> Tensor:
         r"""Evaluate the cost-aware utility on the candidates and improvements.
@@ -67,7 +67,7 @@ def __init__(self, cost: Callable[[Tensor, Tensor], Tensor]) -> None:
         self._cost_callable: Callable[[Tensor, Tensor], Tensor] = cost
 
     def forward(
-        self, X: Tensor, deltas: Tensor, sampler: Optional[MCSampler] = None
+        self, X: Tensor, deltas: Tensor, sampler: MCSampler | None = None
     ) -> Tensor:
         r"""Evaluate the cost function on the candidates and improvements.
 
@@ -109,9 +109,9 @@ class InverseCostWeightedUtility(CostAwareUtility):
 
     def __init__(
         self,
-        cost_model: Union[DeterministicModel, GPyTorchModel],
+        cost_model: DeterministicModel | GPyTorchModel,
         use_mean: bool = True,
-        cost_objective: Optional[MCAcquisitionObjective] = None,
+        cost_objective: MCAcquisitionObjective | None = None,
         min_cost: float = 1e-2,
     ) -> None:
         r"""Cost-aware utility that weights increase in utility by inverse cost.
@@ -153,8 +153,8 @@ def forward(
         self,
         X: Tensor,
         deltas: Tensor,
-        sampler: Optional[MCSampler] = None,
-        X_evaluation_mask: Optional[Tensor] = None,
+        sampler: MCSampler | None = None,
+        X_evaluation_mask: Tensor | None = None,
     ) -> Tensor:
         r"""Evaluate the cost function on the candidates and improvements.
         Note that negative values of `deltas` are instead scaled by the cost, and not
diff --git a/botorch/acquisition/decoupled.py b/botorch/acquisition/decoupled.py
index 59e79bc050..8969bfea6b 100644
--- a/botorch/acquisition/decoupled.py
+++ b/botorch/acquisition/decoupled.py
@@ -10,7 +10,6 @@
 
 import warnings
 from abc import ABC
-from typing import Optional
 
 import torch
 from botorch.acquisition.acquisition import AcquisitionFunction
@@ -52,7 +51,7 @@ class DecoupledAcquisitionFunction(AcquisitionFunction, ABC):
     """
 
     def __init__(
-        self, model: ModelList, X_evaluation_mask: Optional[Tensor] = None, **kwargs
+        self, model: ModelList, X_evaluation_mask: Tensor | None = None, **kwargs
     ) -> None:
         r"""Initialize.
 
@@ -71,12 +70,12 @@ def __init__(
         self.X_pending = None
 
     @property
-    def X_evaluation_mask(self) -> Optional[Tensor]:
+    def X_evaluation_mask(self) -> Tensor | None:
         r"""Get the evaluation indices for the new candidate."""
         return self._X_evaluation_mask
 
     @X_evaluation_mask.setter
-    def X_evaluation_mask(self, X_evaluation_mask: Optional[Tensor] = None) -> None:
+    def X_evaluation_mask(self, X_evaluation_mask: Tensor | None = None) -> None:
         r"""Set the evaluation indices for the new candidate."""
         if X_evaluation_mask is not None:
             # TODO: Add batch support
@@ -92,8 +91,8 @@ def X_evaluation_mask(self, X_evaluation_mask: Optional[Tensor] = None) -> None:
 
     def set_X_pending(
         self,
-        X_pending: Optional[Tensor] = None,
-        X_pending_evaluation_mask: Optional[Tensor] = None,
+        X_pending: Tensor | None = None,
+        X_pending_evaluation_mask: Tensor | None = None,
     ) -> None:
         r"""Informs the AF about pending design points for different outcomes.
@@ -135,7 +134,7 @@ def set_X_pending(
         self.X_pending = X_pending
         self.X_pending_evaluation_mask = X_pending_evaluation_mask
 
-    def construct_evaluation_mask(self, X: Tensor) -> Optional[Tensor]:
+    def construct_evaluation_mask(self, X: Tensor) -> Tensor | None:
         r"""Construct the boolean evaluation mask for X and X_pending
 
         Args:
diff --git a/botorch/acquisition/factory.py b/botorch/acquisition/factory.py
index 7047fede25..5915a01505 100644
--- a/botorch/acquisition/factory.py
+++ b/botorch/acquisition/factory.py
@@ -10,7 +10,8 @@
 
 from __future__ import annotations
 
-from typing import Callable, Optional, Union
+from collections.abc import Callable
+
 
 import torch
 
@@ -35,21 +36,21 @@ def get_acquisition_function(
     model: Model,
     objective: MCAcquisitionObjective,
     X_observed: Tensor,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    X_pending: Optional[Tensor] = None,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    eta: Optional[Union[Tensor, float]] = 1e-3,
+    posterior_transform: PosteriorTransform | None = None,
+    X_pending: Tensor | None = None,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    eta: Tensor | float | None = 1e-3,
     mc_samples: int = 512,
-    seed: Optional[int] = None,
+    seed: int | None = None,
     *,
     # optional parameters that are only needed for certain acquisition functions
     tau: float = 1e-3,
     prune_baseline: bool = True,
-    marginalize_dim: Optional[int] = None,
+    marginalize_dim: int | None = None,
     cache_root: bool = True,
-    beta: Optional[float] = None,
-    ref_point: Union[None, list[float], Tensor] = None,
-    Y: Optional[Tensor] = None,
+    beta: float | None = None,
+    ref_point: None | list[float] | Tensor = None,
+    Y: Tensor | None = None,
     alpha: float = 0.0,
 ) -> monte_carlo.MCAcquisitionFunction:
     r"""Convenience function for initializing botorch acquisition functions.
diff --git a/botorch/acquisition/fixed_feature.py b/botorch/acquisition/fixed_feature.py
index e0f0fa38ab..379511d586 100644
--- a/botorch/acquisition/fixed_feature.py
+++ b/botorch/acquisition/fixed_feature.py
@@ -14,7 +14,6 @@
 
 from collections.abc import Sequence
 from numbers import Number
-from typing import Optional, Union
 
 import torch
 from botorch.acquisition.acquisition import AcquisitionFunction
@@ -22,7 +21,7 @@
 from torch.nn import Module
 
 
-def get_dtype_of_sequence(values: Sequence[Union[Tensor, float]]) -> torch.dtype:
+def get_dtype_of_sequence(values: Sequence[Tensor | float]) -> torch.dtype:
     """
     Return torch.float32 if everything is single-precision and
     torch.float64 otherwise.
@@ -30,21 +29,21 @@ def get_dtype_of_sequence(values: Sequence[Union[Tensor, float]]) -> torch.dtype
     Numbers (non-tensors) are double-precision.
     """
 
-    def _is_single(value: Union[Tensor, float]) -> bool:
+    def _is_single(value: Tensor | float) -> bool:
         return isinstance(value, Tensor) and value.dtype == torch.float32
 
     all_single_precision = all(_is_single(value) for value in values)
     return torch.float32 if all_single_precision else torch.float64
 
 
-def get_device_of_sequence(values: Sequence[Union[Tensor, float]]) -> torch.dtype:
+def get_device_of_sequence(values: Sequence[Tensor | float]) -> torch.dtype:
     """
     CPU if everything is on the CPU; Cuda otherwise.
 
     Numbers (non-tensors) are considered to be on the CPU.
""" - def _is_cuda(value: Union[Tensor, float]) -> bool: + def _is_cuda(value: Tensor | float) -> bool: return hasattr(value, "device") and value.device == torch.device("cuda") any_cuda = any(_is_cuda(value) for value in values) @@ -68,7 +67,7 @@ def __init__( acq_function: AcquisitionFunction, d: int, columns: list[int], - values: Union[Tensor, Sequence[Union[Tensor, float]]], + values: Tensor | Sequence[Tensor | float], ) -> None: r"""Derived Acquisition Function by fixing a subset of input features. @@ -167,7 +166,7 @@ def X_pending(self): ) @X_pending.setter - def X_pending(self, X_pending: Optional[Tensor]): + def X_pending(self, X_pending: Tensor | None): r"""Sets the `X_pending` of the base acquisition function.""" if X_pending is not None: self.acq_func.X_pending = self._construct_X_full(X_pending) diff --git a/botorch/acquisition/input_constructors.py b/botorch/acquisition/input_constructors.py index f145781da5..ae5a9de0ca 100644 --- a/botorch/acquisition/input_constructors.py +++ b/botorch/acquisition/input_constructors.py @@ -12,8 +12,8 @@ from __future__ import annotations import inspect -from collections.abc import Hashable, Iterable, Sequence -from typing import Any, Callable, List, Optional, TypeVar, Union +from collections.abc import Callable, Hashable, Iterable, Sequence +from typing import Any, TypeVar, Union import torch from botorch.acquisition.acquisition import AcquisitionFunction @@ -130,7 +130,7 @@ def _field_is_shared( - datasets: Union[Iterable[SupervisedDataset], dict[Hashable, SupervisedDataset]], + datasets: Iterable[SupervisedDataset] | dict[Hashable, SupervisedDataset], fieldname: str, ) -> bool: r"""Determines whether or not a given field is shared by all datasets.""" @@ -157,8 +157,8 @@ def _field_is_shared( def _get_dataset_field( dataset: MaybeDict[SupervisedDataset], fieldname: str, - transform: Optional[Callable[[BotorchContainer], Any]] = None, - join_rule: Optional[Callable[[Sequence[Any]], Any]] = None, + transform: Callable[[BotorchContainer], Any] | None = None, + join_rule: Callable[[Sequence[Any]], Any] | None = None, first_only: bool = False, assert_shared: bool = False, ) -> Any: @@ -278,8 +278,8 @@ def _register_acqf_input_constructor( @acqf_input_constructor(PosteriorMean) def construct_inputs_posterior_mean( model: Model, - posterior_transform: Optional[PosteriorTransform] = None, -) -> dict[str, Union[Model, Optional[PosteriorTransform]]]: + posterior_transform: PosteriorTransform | None = None, +) -> dict[str, Model | PosteriorTransform | None]: r"""Construct kwargs for PosteriorMean acquisition function. Args: @@ -302,8 +302,8 @@ def construct_inputs_posterior_mean( def construct_inputs_best_f( model: Model, training_data: MaybeDict[SupervisedDataset], - posterior_transform: Optional[PosteriorTransform] = None, - best_f: Optional[Union[float, Tensor]] = None, + posterior_transform: PosteriorTransform | None = None, + best_f: float | Tensor | None = None, maximize: bool = True, ) -> dict[str, Any]: r"""Construct kwargs for the acquisition functions requiring `best_f`. @@ -337,8 +337,8 @@ def construct_inputs_best_f( @acqf_input_constructor(UpperConfidenceBound) def construct_inputs_ucb( model: Model, - posterior_transform: Optional[PosteriorTransform] = None, - beta: Union[float, Tensor] = 0.2, + posterior_transform: PosteriorTransform | None = None, + beta: float | Tensor = 0.2, maximize: bool = True, ) -> dict[str, Any]: r"""Construct kwargs for `UpperConfidenceBound`. 
@@ -395,12 +395,12 @@ def construct_inputs_noisy_ei(
 @acqf_input_constructor(qSimpleRegret)
 def construct_inputs_qSimpleRegret(
     model: Model,
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    X_pending: Optional[Tensor] = None,
-    sampler: Optional[MCSampler] = None,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    X_baseline: Optional[Tensor] = None,
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    X_pending: Tensor | None = None,
+    sampler: MCSampler | None = None,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    X_baseline: Tensor | None = None,
 ) -> dict[str, Any]:
     r"""Construct kwargs for qSimpleRegret.
 
@@ -449,13 +449,13 @@ def construct_inputs_qSimpleRegret(
 def construct_inputs_qEI(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    X_pending: Optional[Tensor] = None,
-    sampler: Optional[MCSampler] = None,
-    best_f: Optional[Union[float, Tensor]] = None,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    eta: Union[Tensor, float] = 1e-3,
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    X_pending: Tensor | None = None,
+    sampler: MCSampler | None = None,
+    best_f: float | Tensor | None = None,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    eta: Tensor | float = 1e-3,
 ) -> dict[str, Any]:
     r"""Construct kwargs for the `qExpectedImprovement` constructor.
 
@@ -507,13 +507,13 @@ def construct_inputs_qEI(
 def construct_inputs_qLogEI(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    X_pending: Optional[Tensor] = None,
-    sampler: Optional[MCSampler] = None,
-    best_f: Optional[Union[float, Tensor]] = None,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    eta: Union[Tensor, float] = 1e-3,
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    X_pending: Tensor | None = None,
+    sampler: MCSampler | None = None,
+    best_f: float | Tensor | None = None,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    eta: Tensor | float = 1e-3,
     fat: bool = True,
     tau_max: float = TAU_MAX,
     tau_relu: float = TAU_RELU,
@@ -571,15 +571,15 @@ def construct_inputs_qLogEI(
 def construct_inputs_qNEI(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    X_pending: Optional[Tensor] = None,
-    sampler: Optional[MCSampler] = None,
-    X_baseline: Optional[Tensor] = None,
-    prune_baseline: Optional[bool] = True,
-    cache_root: Optional[bool] = True,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    eta: Union[Tensor, float] = 1e-3,
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    X_pending: Tensor | None = None,
+    sampler: MCSampler | None = None,
+    X_baseline: Tensor | None = None,
+    prune_baseline: bool | None = True,
+    cache_root: bool | None = True,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    eta: Tensor | float = 1e-3,
 ) -> dict[str, Any]:
     r"""Construct kwargs for the `qNoisyExpectedImprovement` constructor.
@@ -637,15 +637,15 @@ def construct_inputs_qNEI(
 def construct_inputs_qLogNEI(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    X_pending: Optional[Tensor] = None,
-    sampler: Optional[MCSampler] = None,
-    X_baseline: Optional[Tensor] = None,
-    prune_baseline: Optional[bool] = True,
-    cache_root: Optional[bool] = True,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    eta: Union[Tensor, float] = 1e-3,
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    X_pending: Tensor | None = None,
+    sampler: MCSampler | None = None,
+    X_baseline: Tensor | None = None,
+    prune_baseline: bool | None = True,
+    cache_root: bool | None = True,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    eta: Tensor | float = 1e-3,
     fat: bool = True,
     tau_max: float = TAU_MAX,
     tau_relu: float = TAU_RELU,
@@ -711,14 +711,14 @@ def construct_inputs_qLogNEI(
 def construct_inputs_qPI(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    X_pending: Optional[Tensor] = None,
-    sampler: Optional[MCSampler] = None,
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    X_pending: Tensor | None = None,
+    sampler: MCSampler | None = None,
     tau: float = 1e-3,
-    best_f: Optional[Union[float, Tensor]] = None,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    eta: Union[Tensor, float] = 1e-3,
+    best_f: float | Tensor | None = None,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    eta: Tensor | float = 1e-3,
 ) -> dict[str, Any]:
     r"""Construct kwargs for the `qProbabilityOfImprovement` constructor.
 
@@ -776,12 +776,12 @@ def construct_inputs_qPI(
 @acqf_input_constructor(qLowerConfidenceBound, qUpperConfidenceBound)
 def construct_inputs_qUCB(
     model: Model,
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    X_pending: Optional[Tensor] = None,
-    sampler: Optional[MCSampler] = None,
-    X_baseline: Optional[Tensor] = None,
-    constraints: Optional[List[Callable[[Tensor], Tensor]]] = None,
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    X_pending: Tensor | None = None,
+    sampler: MCSampler | None = None,
+    X_baseline: Tensor | None = None,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
     beta: float = 0.2,
 ) -> dict[str, Any]:
     r"""Construct kwargs for the `qUpperConfidenceBound` constructor.
@@ -844,10 +844,10 @@ def construct_inputs_EHVI(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
     objective_thresholds: Tensor,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    alpha: Optional[float] = None,
-    Y_pmean: Optional[Tensor] = None,
+    posterior_transform: PosteriorTransform | None = None,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    alpha: float | None = None,
+    Y_pmean: Tensor | None = None,
 ) -> dict[str, Any]:
     r"""Construct kwargs for `ExpectedHypervolumeImprovement` constructor."""
     num_objectives = objective_thresholds.shape[0]
@@ -901,11 +901,11 @@ def construct_inputs_qEHVI(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
     objective_thresholds: Tensor,
-    objective: Optional[MCMultiOutputObjective] = None,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    alpha: Optional[float] = None,
-    sampler: Optional[MCSampler] = None,
-    X_pending: Optional[Tensor] = None,
+    objective: MCMultiOutputObjective | None = None,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    alpha: float | None = None,
+    sampler: MCSampler | None = None,
+    X_pending: Tensor | None = None,
     eta: float = 1e-3,
     mc_samples: int = 128,
     qmc: bool = True,
@@ -980,12 +980,12 @@ def construct_inputs_qNEHVI(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
     objective_thresholds: Tensor,
-    objective: Optional[MCMultiOutputObjective] = None,
-    X_baseline: Optional[Tensor] = None,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    alpha: Optional[float] = None,
-    sampler: Optional[MCSampler] = None,
-    X_pending: Optional[Tensor] = None,
+    objective: MCMultiOutputObjective | None = None,
+    X_baseline: Tensor | None = None,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    alpha: float | None = None,
+    sampler: MCSampler | None = None,
+    X_pending: Tensor | None = None,
     eta: float = 1e-3,
     fat: bool = False,
     mc_samples: int = 128,
@@ -1052,12 +1052,12 @@ def construct_inputs_qLogNEHVI(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
     objective_thresholds: Tensor,
-    objective: Optional[MCMultiOutputObjective] = None,
-    X_baseline: Optional[Tensor] = None,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    alpha: Optional[float] = None,
-    sampler: Optional[MCSampler] = None,
-    X_pending: Optional[Tensor] = None,
+    objective: MCMultiOutputObjective | None = None,
+    X_baseline: Tensor | None = None,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    alpha: float | None = None,
+    sampler: MCSampler | None = None,
+    X_pending: Tensor | None = None,
     eta: float = 1e-3,
     fat: bool = True,
     mc_samples: int = 128,
@@ -1103,15 +1103,15 @@ def construct_inputs_qLogNEHVI(
 def construct_inputs_qLogNParEGO(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
-    scalarization_weights: Optional[Tensor] = None,
-    objective: Optional[MCMultiOutputObjective] = None,
-    X_pending: Optional[Tensor] = None,
-    sampler: Optional[MCSampler] = None,
-    X_baseline: Optional[Tensor] = None,
-    prune_baseline: Optional[bool] = True,
-    cache_root: Optional[bool] = True,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    eta: Union[Tensor, float] = 1e-3,
+    scalarization_weights: Tensor | None = None,
+    objective: MCMultiOutputObjective | None = None,
+    X_pending: Tensor | None = None,
+    sampler: MCSampler | None = None,
+    X_baseline: Tensor | None = None,
+    prune_baseline: bool | None = True,
+    cache_root: bool | None = True,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    eta: Tensor | float = 1e-3,
     fat: bool = True,
     tau_max: float = TAU_MAX,
     tau_relu: float = TAU_RELU,
@@ -1182,7 +1182,7 @@ def construct_inputs_qMES(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
     bounds: list[tuple[float, float]],
-    posterior_transform: Optional[PosteriorTransform] = None,
+    posterior_transform: PosteriorTransform | None = None,
     candidate_size: int = 1000,
     maximize: bool = True,
     # TODO: qMES also supports other inputs, such as num_fantasies
@@ -1202,8 +1202,8 @@ def construct_inputs_qMES(
 
 
 def construct_inputs_mf_base(
-    target_fidelities: dict[int, Union[int, float]],
-    fidelity_weights: Optional[dict[int, float]] = None,
+    target_fidelities: dict[int, int | float],
+    fidelity_weights: dict[int, float] | None = None,
     cost_intercept: float = 1.0,
     num_trace_observations: int = 0,
 ) -> dict[str, Any]:
@@ -1242,8 +1242,8 @@ def construct_inputs_qKG(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
     bounds: list[tuple[float, float]],
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
     num_fantasies: int = 64,
     with_current_value: bool = False,
     **optimize_objective_kwargs: TOptimizeObjectiveKwargs,
@@ -1281,8 +1281,8 @@ def construct_inputs_qHVKG(
     training_data: MaybeDict[SupervisedDataset],
     bounds: list[tuple[float, float]],
     objective_thresholds: Tensor,
-    objective: Optional[MCMultiOutputObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
+    objective: MCMultiOutputObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
     num_fantasies: int = 8,
     num_pareto: int = 10,
     **optimize_objective_kwargs: TOptimizeObjectiveKwargs,
@@ -1326,10 +1326,10 @@ def construct_inputs_qMFKG(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
     bounds: list[tuple[float, float]],
-    target_fidelities: dict[int, Union[int, float]],
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    fidelity_weights: Optional[dict[int, float]] = None,
+    target_fidelities: dict[int, int | float],
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    fidelity_weights: dict[int, float] | None = None,
     cost_intercept: float = 1.0,
     num_trace_observations: int = 0,
     num_fantasies: int = 64,
@@ -1372,11 +1372,11 @@ def construct_inputs_qMFHVKG(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
     bounds: list[tuple[float, float]],
-    target_fidelities: dict[int, Union[int, float]],
+    target_fidelities: dict[int, int | float],
     objective_thresholds: Tensor,
-    objective: Optional[MCMultiOutputObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    fidelity_weights: Optional[dict[int, float]] = None,
+    objective: MCMultiOutputObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    fidelity_weights: dict[int, float] | None = None,
     cost_intercept: float = 1.0,
     num_trace_observations: int = 0,
     num_fantasies: int = 8,
@@ -1442,9 +1442,9 @@ def construct_inputs_qMFMES(
     model: Model,
     training_data: MaybeDict[SupervisedDataset],
     bounds: list[tuple[float, float]],
-    target_fidelities: dict[int, Union[int, float]],
+    target_fidelities: dict[int, int | float],
     num_fantasies: int = 64,
-    fidelity_weights: Optional[dict[int, float]] = None,
+    fidelity_weights: dict[int, float] | None = None,
     cost_intercept: float = 1.0,
     num_trace_observations: int = 0,
     candidate_size: int = 1000,
@@ -1472,11 +1472,11 @@ def construct_inputs_qMFMES(
 @acqf_input_constructor(AnalyticExpectedUtilityOfBestOption)
 def construct_inputs_analytic_eubo(
     model: Model,
-    pref_model: Optional[Model] = None,
-    previous_winner: Optional[Tensor] = None,
-    sample_multiplier: Optional[float] = 1.0,
-    objective: Optional[LearnedObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
+    pref_model: Model | None = None,
+    previous_winner: Tensor | None = None,
+    sample_multiplier: float | None = 1.0,
+    objective: LearnedObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
 ) -> dict[str, Any]:
     r"""Construct kwargs for the `AnalyticExpectedUtilityOfBestOption` constructor.
 
@@ -1531,12 +1531,12 @@ def construct_inputs_analytic_eubo(
 @acqf_input_constructor(qExpectedUtilityOfBestOption)
 def construct_inputs_qeubo(
     model: Model,
-    pref_model: Optional[Model] = None,
-    sample_multiplier: Optional[float] = 1.0,
-    sampler: Optional[MCSampler] = None,
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    X_pending: Optional[Tensor] = None,
+    pref_model: Model | None = None,
+    sample_multiplier: float | None = 1.0,
+    sampler: MCSampler | None = None,
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    X_pending: Tensor | None = None,
 ) -> dict[str, Any]:
     r"""Construct kwargs for the `qExpectedUtilityOfBestOption` (qEUBO) constructor.
 
@@ -1590,7 +1590,7 @@ def construct_inputs_qeubo(
 
 def get_best_f_analytic(
     training_data: MaybeDict[SupervisedDataset],
-    posterior_transform: Optional[PosteriorTransform] = None,
+    posterior_transform: PosteriorTransform | None = None,
 ) -> Tensor:
     if isinstance(training_data, dict) and not _field_is_shared(
         training_data, fieldname="X"
@@ -1615,10 +1615,10 @@ def get_best_f_analytic(
 
 def get_best_f_mc(
     training_data: MaybeDict[SupervisedDataset],
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-    model: Optional[Model] = None,
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    constraints: list[Callable[[Tensor], Tensor]] | None = None,
+    model: Model | None = None,
 ) -> Tensor:
     """
     Computes the maximum value of the objective over the training data.
@@ -1692,17 +1692,17 @@ def optimize_objective(
     model: Model,
     bounds: Tensor,
     q: int,
-    acq_function: Optional[AcquisitionFunction] = None,
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    linear_constraints: Optional[tuple[Tensor, Tensor]] = None,
-    fixed_features: Optional[dict[int, float]] = None,
+    acq_function: AcquisitionFunction | None = None,
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    linear_constraints: tuple[Tensor, Tensor] | None = None,
+    fixed_features: dict[int, float] | None = None,
     qmc: bool = True,
     mc_samples: int = 512,
-    seed_inner: Optional[int] = None,
-    optimizer_options: Optional[dict[str, Any]] = None,
-    post_processing_func: Optional[Callable[[Tensor], Tensor]] = None,
-    batch_initial_conditions: Optional[Tensor] = None,
+    seed_inner: int | None = None,
+    optimizer_options: dict[str, Any] | None = None,
+    post_processing_func: Callable[[Tensor], Tensor] | None = None,
+    batch_initial_conditions: Tensor | None = None,
     sequential: bool = False,
 ) -> tuple[Tensor, Tensor]:
     r"""Optimize an objective under the given model.
@@ -1803,7 +1803,7 @@ def construct_inputs_qJES(
     num_optima: int = 64,
     maximize: bool = True,
     condition_noiseless: bool = True,
-    X_pending: Optional[Tensor] = None,
+    X_pending: Tensor | None = None,
     estimation_type: str = "LB",
     num_samples: int = 64,
 ):
@@ -1831,9 +1831,9 @@ def construct_inputs_qJES(
 @acqf_input_constructor(qBayesianActiveLearningByDisagreement)
 def construct_inputs_BALD(
     model: Model,
-    X_pending: Optional[Tensor] = None,
-    sampler: Optional[MCSampler] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
+    X_pending: Tensor | None = None,
+    sampler: MCSampler | None = None,
+    posterior_transform: PosteriorTransform | None = None,
 ):
     inputs = {
         "model": model,
@@ -1849,8 +1849,8 @@ def construct_inputs_NIPV(
     model: Model,
     bounds: list[tuple[float, float]],
     num_mc_points: int = 128,
-    X_pending: Optional[Tensor] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
+    X_pending: Tensor | None = None,
+    posterior_transform: PosteriorTransform | None = None,
 ) -> dict[str, Any]:
     """Construct inputs for qNegIntegratedPosteriorVariance."""
     bounds = torch.as_tensor(bounds).to(model.train_targets).T
@@ -1866,7 +1866,7 @@ def construct_inputs_NIPV(
 
 def _get_ref_point(
     objective_thresholds: Tensor,
-    objective: Optional[MCMultiOutputObjective] = None,
+    objective: MCMultiOutputObjective | None = None,
 ) -> Tensor:
 
     if objective is None:
diff --git a/botorch/acquisition/joint_entropy_search.py b/botorch/acquisition/joint_entropy_search.py
index b1cd01764a..1a856cc6df 100644
--- a/botorch/acquisition/joint_entropy_search.py
+++ b/botorch/acquisition/joint_entropy_search.py
@@ -25,7 +25,6 @@
 
 import warnings
 from math import log, pi
-from typing import Optional
 
 import torch
 from botorch import settings
@@ -73,8 +72,8 @@ def __init__(
         optimal_inputs: Tensor,
         optimal_outputs: Tensor,
         condition_noiseless: bool = True,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
         estimation_type: str = "LB",
         maximize: bool = True,
         num_samples: int = 64,
diff --git a/botorch/acquisition/knowledge_gradient.py b/botorch/acquisition/knowledge_gradient.py
index 66cb8125e9..8e3407f6ea 100644
--- a/botorch/acquisition/knowledge_gradient.py
+++ b/botorch/acquisition/knowledge_gradient.py
@@ -26,8 +26,10 @@
 
 from __future__ import annotations
 
+from collections.abc import Callable
+
 from copy import deepcopy
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 from botorch import settings
@@ -67,13 +69,13 @@ class qKnowledgeGradient(MCAcquisitionFunction, OneShotAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        num_fantasies: Optional[int] = 64,
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        inner_sampler: Optional[MCSampler] = None,
-        X_pending: Optional[Tensor] = None,
-        current_value: Optional[Tensor] = None,
+        num_fantasies: int | None = 64,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        inner_sampler: MCSampler | None = None,
+        X_pending: Tensor | None = None,
+        current_value: Tensor | None = None,
     ) -> None:
         r"""q-Knowledge Gradient (one-shot optimization).
 
@@ -319,18 +321,18 @@ class qMultiFidelityKnowledgeGradient(qKnowledgeGradient):
     def __init__(
         self,
         model: Model,
-        num_fantasies: Optional[int] = 64,
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        inner_sampler: Optional[MCSampler] = None,
-        X_pending: Optional[Tensor] = None,
-        current_value: Optional[Tensor] = None,
-        cost_aware_utility: Optional[CostAwareUtility] = None,
+        num_fantasies: int | None = 64,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        inner_sampler: MCSampler | None = None,
+        X_pending: Tensor | None = None,
+        current_value: Tensor | None = None,
+        cost_aware_utility: CostAwareUtility | None = None,
         project: Callable[[Tensor], Tensor] = lambda X: X,
         expand: Callable[[Tensor], Tensor] = lambda X: X,
-        valfunc_cls: Optional[type[AcquisitionFunction]] = None,
-        valfunc_argfac: Optional[Callable[[Model], dict[str, Any]]] = None,
+        valfunc_cls: type[AcquisitionFunction] | None = None,
+        valfunc_argfac: Callable[[Model], dict[str, Any]] | None = None,
     ) -> None:
         r"""Multi-Fidelity q-Knowledge Gradient (one-shot optimization).
 
@@ -516,12 +518,12 @@ def forward(self, X: Tensor) -> Tensor:
 
 
 def _get_value_function(
     model: Model,
-    objective: Optional[MCAcquisitionObjective] = None,
-    posterior_transform: Optional[PosteriorTransform] = None,
-    sampler: Optional[MCSampler] = None,
-    project: Optional[Callable[[Tensor], Tensor]] = None,
-    valfunc_cls: Optional[type[AcquisitionFunction]] = None,
-    valfunc_argfac: Optional[Callable[[Model], dict[str, Any]]] = None,
+    objective: MCAcquisitionObjective | None = None,
+    posterior_transform: PosteriorTransform | None = None,
+    sampler: MCSampler | None = None,
+    project: Callable[[Tensor], Tensor] | None = None,
+    valfunc_cls: type[AcquisitionFunction] | None = None,
+    valfunc_argfac: Callable[[Model], dict[str, Any]] | None = None,
 ) -> AcquisitionFunction:
     r"""Construct value function (i.e. inner acquisition function)."""
     if valfunc_cls is not None:
diff --git a/botorch/acquisition/logei.py b/botorch/acquisition/logei.py
index 0c67b201a7..6432dc4eb5 100644
--- a/botorch/acquisition/logei.py
+++ b/botorch/acquisition/logei.py
@@ -17,11 +17,13 @@
 
 from __future__ import annotations
 
+from collections.abc import Callable
+
 from copy import deepcopy
 from functools import partial
-from typing import Callable, Optional, TypeVar, Union
+from typing import TypeVar
 
 import torch
 from botorch.acquisition.cached_cholesky import CachedCholeskyMCSamplerMixin
@@ -78,12 +80,12 @@ class LogImprovementMCAcquisitionFunction(SampleReducingMCAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
-        constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-        eta: Union[Tensor, float] = 1e-3,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
+        constraints: list[Callable[[Tensor], Tensor]] | None = None,
+        eta: Tensor | float = 1e-3,
         fat: bool = True,
         tau_max: float = TAU_MAX,
     ) -> None:
@@ -161,13 +163,13 @@ class qLogExpectedImprovement(LogImprovementMCAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        best_f: Union[float, Tensor],
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
-        constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-        eta: Union[Tensor, float] = 1e-3,
+        best_f: float | Tensor,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
+        constraints: list[Callable[[Tensor], Tensor]] | None = None,
+        eta: Tensor | float = 1e-3,
         fat: bool = True,
         tau_max: float = TAU_MAX,
         tau_relu: float = TAU_RELU,
@@ -262,18 +264,18 @@ def __init__(
         self,
         model: Model,
         X_baseline: Tensor,
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
-        constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-        eta: Union[Tensor, float] = 1e-3,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
+        constraints: list[Callable[[Tensor], Tensor]] | None = None,
+        eta: Tensor | float = 1e-3,
         fat: bool = True,
         prune_baseline: bool = False,
         cache_root: bool = True,
         tau_max: float = TAU_MAX,
         tau_relu: float = TAU_RELU,
-        marginalize_dim: Optional[int] = None,
+        marginalize_dim: int | None = None,
     ) -> None:
         r"""q-Noisy Expected Improvement.
@@ -365,12 +367,12 @@ def _init_baseline(
         self,
         model: Model,
         X_baseline: Tensor,
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
         prune_baseline: bool = False,
         cache_root: bool = True,
-        marginalize_dim: Optional[int] = None,
+        marginalize_dim: int | None = None,
     ) -> None:
         CachedCholeskyMCSamplerMixin.__init__(
             self, model=model, cache_root=cache_root, sampler=sampler
@@ -509,7 +511,7 @@ def _compute_best_feasible_objective(self, samples: Tensor, obj: Tensor) -> Tens
 def _log_improvement(
     Y: Tensor,
     best_f: Tensor,
-    tau: Union[float, Tensor],
+    tau: float | Tensor,
     fat: bool,
 ) -> Tensor:
     """Computes the logarithm of the softplus-smoothed improvement, i.e.
diff --git a/botorch/acquisition/max_value_entropy_search.py b/botorch/acquisition/max_value_entropy_search.py
index 51865f0711..d2d69be7a6 100644
--- a/botorch/acquisition/max_value_entropy_search.py
+++ b/botorch/acquisition/max_value_entropy_search.py
@@ -31,9 +31,9 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from copy import deepcopy
 from math import log
-from typing import Callable, Optional
 
 import numpy as np
 import torch
@@ -71,9 +71,9 @@ def __init__(
         self,
         model: Model,
         num_mv_samples: int,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        posterior_transform: PosteriorTransform | None = None,
         maximize: bool = True,
-        X_pending: Optional[Tensor] = None,
+        X_pending: Tensor | None = None,
     ) -> None:
         r"""Single-outcome max-value entropy search-based acquisition functions.
 
@@ -136,7 +136,7 @@ def forward(self, X: Tensor) -> Tensor:
         # Average over fantasies, ig is of shape `num_fantasies x batch_shape x (m)`.
         return ig.mean(dim=0)
 
-    def set_X_pending(self, X_pending: Optional[Tensor] = None) -> None:
+    def set_X_pending(self, X_pending: Tensor | None = None) -> None:
         r"""Set pending design points.
 
         Set "pending points" to inform the acquisition function of the candidate
@@ -171,7 +171,7 @@ def _compute_information_gain(self, X: Tensor) -> Tensor:
 
     @abstractmethod
     def _sample_max_values(
-        self, num_samples: int, X_pending: Optional[Tensor] = None
+        self, num_samples: int, X_pending: Tensor | None = None
     ) -> None:
         r"""Draw samples from the posterior over maximum values.
 
@@ -204,11 +204,11 @@ def __init__(
         model: Model,
         candidate_set: Tensor,
         num_mv_samples: int = 10,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        posterior_transform: PosteriorTransform | None = None,
         use_gumbel: bool = True,
         maximize: bool = True,
-        X_pending: Optional[Tensor] = None,
-        train_inputs: Optional[Tensor] = None,
+        X_pending: Tensor | None = None,
+        train_inputs: Tensor | None = None,
     ) -> None:
         r"""Single-outcome MES-like acquisition functions based on discrete MV sampling.
 
@@ -252,7 +252,7 @@ def __init__(
         )
 
     def _sample_max_values(
-        self, num_samples: int, X_pending: Optional[Tensor] = None
+        self, num_samples: int, X_pending: Tensor | None = None
     ) -> None:
         r"""Draw samples from the posterior over maximum values on a discrete set.
@@ -321,11 +321,11 @@ def __init__(
         num_fantasies: int = 16,
         num_mv_samples: int = 10,
         num_y_samples: int = 128,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        posterior_transform: PosteriorTransform | None = None,
         use_gumbel: bool = True,
         maximize: bool = True,
-        X_pending: Optional[Tensor] = None,
-        train_inputs: Optional[Tensor] = None,
+        X_pending: Tensor | None = None,
+        train_inputs: Tensor | None = None,
     ) -> None:
         r"""Single-outcome max-value entropy search acquisition function.
 
@@ -370,7 +370,7 @@ def __init__(
         self.num_fantasies = num_fantasies
         self.set_X_pending(X_pending)  # this did not happen in the super constructor
 
-    def set_X_pending(self, X_pending: Optional[Tensor] = None) -> None:
+    def set_X_pending(self, X_pending: Tensor | None = None) -> None:
         r"""Set pending points.
 
         Informs the acquisition function about pending design points,
@@ -690,11 +690,11 @@ def __init__(
         num_fantasies: int = 16,
         num_mv_samples: int = 10,
         num_y_samples: int = 128,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        posterior_transform: PosteriorTransform | None = None,
         use_gumbel: bool = True,
         maximize: bool = True,
-        X_pending: Optional[Tensor] = None,
-        cost_aware_utility: Optional[CostAwareUtility] = None,
+        X_pending: Tensor | None = None,
+        cost_aware_utility: CostAwareUtility | None = None,
         project: Callable[[Tensor], Tensor] = lambda X: X,
         expand: Callable[[Tensor], Tensor] = lambda X: X,
     ) -> None:
@@ -841,10 +841,10 @@ def __init__(
         num_fantasies: int = 16,
         num_mv_samples: int = 10,
         num_y_samples: int = 128,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        posterior_transform: PosteriorTransform | None = None,
         use_gumbel: bool = True,
         maximize: bool = True,
-        cost_aware_utility: Optional[CostAwareUtility] = None,
+        cost_aware_utility: CostAwareUtility | None = None,
         project: Callable[[Tensor], Tensor] = lambda X: X,
         expand: Callable[[Tensor], Tensor] = lambda X: X,
     ) -> None:
@@ -925,7 +925,7 @@ def _sample_max_value_Thompson(
     model: Model,
     candidate_set: Tensor,
     num_samples: int,
-    posterior_transform: Optional[PosteriorTransform] = None,
+    posterior_transform: PosteriorTransform | None = None,
    maximize: bool = True,
 ) -> Tensor:
     """Samples the max values by discrete Thompson sampling.
@@ -960,7 +960,7 @@ def _sample_max_value_Gumbel(
     model: Model,
     candidate_set: Tensor,
     num_samples: int,
-    posterior_transform: Optional[PosteriorTransform] = None,
+    posterior_transform: PosteriorTransform | None = None,
     maximize: bool = True,
 ) -> Tensor:
     """Samples the max values by Gumbel approximation.
diff --git a/botorch/acquisition/monte_carlo.py b/botorch/acquisition/monte_carlo.py
index b97fb6957e..17cf53fd14 100644
--- a/botorch/acquisition/monte_carlo.py
+++ b/botorch/acquisition/monte_carlo.py
@@ -24,9 +24,10 @@
 
 import math
 from abc import ABC, abstractmethod
+from collections.abc import Callable
 from copy import deepcopy
 from functools import partial
-from typing import Callable, Optional, Protocol, Union
+from typing import Protocol
 
 import torch
 from botorch.acquisition.acquisition import AcquisitionFunction, MCSamplerMixin
@@ -63,10 +64,10 @@ class MCAcquisitionFunction(AcquisitionFunction, MCSamplerMixin, ABC):
     def __init__(
         self,
         model: Model,
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
     ) -> None:
         r"""
         Args:
@@ -180,14 +181,14 @@ class SampleReducingMCAcquisitionFunction(MCAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
         sample_reduction: SampleReductionProtocol = torch.mean,
         q_reduction: SampleReductionProtocol = torch.amax,
-        constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-        eta: Union[Tensor, float] = 1e-3,
+        constraints: list[Callable[[Tensor], Tensor]] | None = None,
+        eta: Tensor | float = 1e-3,
         fat: bool = False,
     ):
         r"""Constructor of SampleReducingMCAcquisitionFunction.
@@ -356,13 +357,13 @@ class qExpectedImprovement(SampleReducingMCAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        best_f: Union[float, Tensor],
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
-        constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-        eta: Union[Tensor, float] = 1e-3,
+        best_f: float | Tensor,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
+        constraints: list[Callable[[Tensor], Tensor]] | None = None,
+        eta: Tensor | float = 1e-3,
     ) -> None:
         r"""q-Expected Improvement.
 
@@ -442,15 +443,15 @@ def __init__(
         self,
         model: Model,
         X_baseline: Tensor,
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
         prune_baseline: bool = True,
         cache_root: bool = True,
-        constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-        eta: Union[Tensor, float] = 1e-3,
-        marginalize_dim: Optional[int] = None,
+        constraints: list[Callable[[Tensor], Tensor]] | None = None,
+        eta: Tensor | float = 1e-3,
+        marginalize_dim: int | None = None,
     ) -> None:
         r"""q-Noisy Expected Improvement.
@@ -667,14 +668,14 @@ class qProbabilityOfImprovement(SampleReducingMCAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        best_f: Union[float, Tensor],
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
+        best_f: float | Tensor,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
         tau: float = 1e-3,
-        constraints: Optional[list[Callable[[Tensor], Tensor]]] = None,
-        eta: Union[Tensor, float] = 1e-3,
+        constraints: list[Callable[[Tensor], Tensor]] | None = None,
+        eta: Tensor | float = 1e-3,
     ) -> None:
         r"""q-Probability of Improvement.
 
@@ -759,10 +760,10 @@ class qSimpleRegret(SampleReducingMCAcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
     ) -> None:
         r"""q-Simple Regret.
 
@@ -829,10 +830,10 @@ def __init__(
         self,
         model: Model,
         beta: float,
-        sampler: Optional[MCSampler] = None,
-        objective: Optional[MCAcquisitionObjective] = None,
-        posterior_transform: Optional[PosteriorTransform] = None,
-        X_pending: Optional[Tensor] = None,
+        sampler: MCSampler | None = None,
+        objective: MCAcquisitionObjective | None = None,
+        posterior_transform: PosteriorTransform | None = None,
+        X_pending: Tensor | None = None,
     ) -> None:
         r"""q-Upper Confidence Bound.
 
diff --git a/botorch/acquisition/multi_objective/analytic.py b/botorch/acquisition/multi_objective/analytic.py
index 7e4f00e5d9..dd43e87c20 100644
--- a/botorch/acquisition/multi_objective/analytic.py
+++ b/botorch/acquisition/multi_objective/analytic.py
@@ -20,7 +20,6 @@
 from __future__ import annotations
 
 from itertools import product
-from typing import Optional
 
 import torch
 from botorch.acquisition.multi_objective.base import (
@@ -42,7 +41,7 @@ def __init__(
         model: Model,
         ref_point: list[float],
         partitioning: NondominatedPartitioning,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        posterior_transform: PosteriorTransform | None = None,
     ) -> None:
         r"""Expected Hypervolume Improvement supporting m>=2 outcomes.
 
diff --git a/botorch/acquisition/multi_objective/base.py b/botorch/acquisition/multi_objective/base.py
index 7d0d72873c..1811e6beed 100644
--- a/botorch/acquisition/multi_objective/base.py
+++ b/botorch/acquisition/multi_objective/base.py
@@ -12,7 +12,7 @@
 from __future__ import annotations
 
 from abc import ABC, abstractmethod
-from typing import Callable, Optional, Union
+from collections.abc import Callable
 
 import torch
 from botorch.acquisition.acquisition import AcquisitionFunction, MCSamplerMixin
@@ -34,7 +34,7 @@ class MultiObjectiveAnalyticAcquisitionFunction(AcquisitionFunction):
     def __init__(
         self,
         model: Model,
-        posterior_transform: Optional[PosteriorTransform] = None,
+        posterior_transform: PosteriorTransform | None = None,
     ) -> None:
         r"""Constructor for the MultiObjectiveAnalyticAcquisitionFunction base class.
@@ -61,7 +61,7 @@ def forward(self, X: Tensor) -> Tensor: """ pass # pragma: no cover - def set_X_pending(self, X_pending: Optional[Tensor] = None) -> None: + def set_X_pending(self, X_pending: Tensor | None = None) -> None: raise UnsupportedError( "Analytic acquisition functions do not account for X_pending yet." ) @@ -81,11 +81,11 @@ class MultiObjectiveMCAcquisitionFunction(AcquisitionFunction, MCSamplerMixin, A def __init__( self, model: Model, - sampler: Optional[MCSampler] = None, - objective: Optional[MCMultiOutputObjective] = None, - constraints: Optional[list[Callable[[Tensor], Tensor]]] = None, - eta: Union[Tensor, float] = 1e-3, - X_pending: Optional[Tensor] = None, + sampler: MCSampler | None = None, + objective: MCMultiOutputObjective | None = None, + constraints: list[Callable[[Tensor], Tensor]] | None = None, + eta: Tensor | float = 1e-3, + X_pending: Tensor | None = None, ) -> None: r"""Constructor for the `MultiObjectiveMCAcquisitionFunction` base class. diff --git a/botorch/acquisition/multi_objective/hypervolume_knowledge_gradient.py b/botorch/acquisition/multi_objective/hypervolume_knowledge_gradient.py index df62947074..dd479c681a 100644 --- a/botorch/acquisition/multi_objective/hypervolume_knowledge_gradient.py +++ b/botorch/acquisition/multi_objective/hypervolume_knowledge_gradient.py @@ -17,8 +17,9 @@ """ import warnings +from collections.abc import Callable from copy import deepcopy -from typing import Any, Callable, Optional +from typing import Any import torch from botorch import settings @@ -77,15 +78,15 @@ def __init__( ref_point: Tensor, num_fantasies: int = 8, num_pareto: int = 10, - sampler: Optional[ListSampler] = None, - objective: Optional[MCMultiOutputObjective] = None, - inner_sampler: Optional[MCSampler] = None, - X_evaluation_mask: Optional[list[Tensor]] = None, - X_pending: Optional[Tensor] = None, - X_pending_evaluation_mask: Optional[Tensor] = None, - current_value: Optional[Tensor] = None, + sampler: ListSampler | None = None, + objective: MCMultiOutputObjective | None = None, + inner_sampler: MCSampler | None = None, + X_evaluation_mask: list[Tensor] | None = None, + X_pending: Tensor | None = None, + X_pending_evaluation_mask: Tensor | None = None, + current_value: Tensor | None = None, use_posterior_mean: bool = True, - cost_aware_utility: Optional[CostAwareUtility] = None, + cost_aware_utility: CostAwareUtility | None = None, ) -> None: r"""q-Hypervolume Knowledge Gradient. 
@@ -311,17 +312,17 @@ def __init__( target_fidelities: dict[int, float], num_fantasies: int = 8, num_pareto: int = 10, - sampler: Optional[MCSampler] = None, - objective: Optional[MCMultiOutputObjective] = None, - inner_sampler: Optional[MCSampler] = None, - X_pending: Optional[Tensor] = None, - X_evaluation_mask: Optional[Tensor] = None, - X_pending_evaluation_mask: Optional[Tensor] = None, - current_value: Optional[Tensor] = None, - cost_aware_utility: Optional[CostAwareUtility] = None, + sampler: MCSampler | None = None, + objective: MCMultiOutputObjective | None = None, + inner_sampler: MCSampler | None = None, + X_pending: Tensor | None = None, + X_evaluation_mask: Tensor | None = None, + X_pending_evaluation_mask: Tensor | None = None, + current_value: Tensor | None = None, + cost_aware_utility: CostAwareUtility | None = None, project: Callable[[Tensor], Tensor] = lambda X: X, - valfunc_cls: Optional[type[AcquisitionFunction]] = None, - valfunc_argfac: Optional[Callable[[Model], dict[str, Any]]] = None, + valfunc_cls: type[AcquisitionFunction] | None = None, + valfunc_argfac: Callable[[Model], dict[str, Any]] | None = None, use_posterior_mean: bool = True, **kwargs: Any, ) -> None: @@ -492,11 +493,11 @@ def forward(self, X: Tensor) -> Tensor: def _get_hv_value_function( model: Model, ref_point: Tensor, - objective: Optional[MCMultiOutputObjective] = None, - sampler: Optional[MCSampler] = None, - project: Optional[Callable[[Tensor], Tensor]] = None, - valfunc_cls: Optional[type[AcquisitionFunction]] = None, - valfunc_argfac: Optional[Callable[[Model], dict[str, Any]]] = None, + objective: MCMultiOutputObjective | None = None, + sampler: MCSampler | None = None, + project: Callable[[Tensor], Tensor] | None = None, + valfunc_cls: type[AcquisitionFunction] | None = None, + valfunc_argfac: Callable[[Model], dict[str, Any]] | None = None, use_posterior_mean: bool = False, ) -> AcquisitionFunction: r"""Construct value function (i.e. inner acquisition function). diff --git a/botorch/acquisition/multi_objective/joint_entropy_search.py b/botorch/acquisition/multi_objective/joint_entropy_search.py index 199a1dce32..ba13c89cad 100644 --- a/botorch/acquisition/multi_objective/joint_entropy_search.py +++ b/botorch/acquisition/multi_objective/joint_entropy_search.py @@ -19,7 +19,6 @@ from abc import abstractmethod from math import pi -from typing import Optional, Union import torch from botorch import settings @@ -46,7 +45,7 @@ def __init__( pareto_sets: Tensor, pareto_fronts: Tensor, hypercell_bounds: Tensor, - X_pending: Optional[Tensor] = None, + X_pending: Tensor | None = None, estimation_type: str = "LB", num_samples: int = 64, ) -> None: @@ -124,7 +123,7 @@ def __init__( @abstractmethod def _compute_posterior_statistics( self, X: Tensor - ) -> dict[str, Union[GPyTorchPosterior, Tensor]]: + ) -> dict[str, GPyTorchPosterior | Tensor]: r"""Compute the posterior statistics. Args: @@ -278,7 +277,7 @@ def __init__( pareto_sets: Tensor, pareto_fronts: Tensor, hypercell_bounds: Tensor, - X_pending: Optional[Tensor] = None, + X_pending: Tensor | None = None, estimation_type: str = "LB", num_samples: int = 64, ) -> None: @@ -331,7 +330,7 @@ def __init__( def _compute_posterior_statistics( self, X: Tensor - ) -> dict[str, Union[Tensor, GPyTorchPosterior]]: + ) -> dict[str, Tensor | GPyTorchPosterior]: r"""Compute the posterior statistics. Args: X: A `batch_shape x q x d`-dim Tensor of inputs. 
diff --git a/botorch/acquisition/multi_objective/logei.py b/botorch/acquisition/multi_objective/logei.py index f9e6896773..ba56662d3c 100644 --- a/botorch/acquisition/multi_objective/logei.py +++ b/botorch/acquisition/multi_objective/logei.py @@ -10,7 +10,8 @@ from __future__ import annotations -from typing import Callable, Optional, Union +from collections.abc import Callable + import torch from botorch.acquisition.logei import TAU_MAX, TAU_RELU @@ -54,13 +55,13 @@ class qLogExpectedHypervolumeImprovement( def __init__( self, model: Model, - ref_point: Union[list[float], Tensor], + ref_point: list[float] | Tensor, partitioning: NondominatedPartitioning, - sampler: Optional[MCSampler] = None, - objective: Optional[MCMultiOutputObjective] = None, - constraints: Optional[list[Callable[[Tensor], Tensor]]] = None, - X_pending: Optional[Tensor] = None, - eta: Union[Tensor, float] = 1e-2, + sampler: MCSampler | None = None, + objective: MCMultiOutputObjective | None = None, + constraints: list[Callable[[Tensor], Tensor]] | None = None, + X_pending: Tensor | None = None, + eta: Tensor | float = 1e-2, fat: bool = True, tau_relu: float = TAU_RELU, tau_max: float = TAU_MAX, @@ -144,7 +145,7 @@ def __init__( self.tau_max = tau_max self.fat = fat - def _compute_log_qehvi(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def _compute_log_qehvi(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Compute the expected (feasible) hypervolume improvement given MC samples. Args: @@ -267,7 +268,7 @@ def _compute_log_qehvi(self, samples: Tensor, X: Optional[Tensor] = None) -> Ten return logmeanexp(logsumexp(log_areas_per_segment, dim=-1), dim=0) def _log_improvement( - self, obj_subsets: Tensor, view_shape: Union[tuple, torch.Size] + self, obj_subsets: Tensor, view_shape: tuple | torch.Size ) -> Tensor: # smooth out the clamp and take the log (previous step 3) # subtract cell lower bounds, clamp min at zero, but first @@ -282,7 +283,7 @@ def _log_improvement( return log_Zi # mc_samples x batch_shape x num_cells x q_choose_i x i x m def _log_cell_lengths( - self, log_improvement_i: Tensor, view_shape: Union[tuple, torch.Size] + self, log_improvement_i: Tensor, view_shape: tuple | torch.Size ) -> Tensor: cell_upper_bounds = self.cell_upper_bounds.clamp_max( 1e10 if log_improvement_i.dtype == torch.double else 1e8 @@ -327,13 +328,13 @@ class qLogNoisyExpectedHypervolumeImprovement( def __init__( self, model: Model, - ref_point: Union[list[float], Tensor], + ref_point: list[float] | Tensor, X_baseline: Tensor, - sampler: Optional[MCSampler] = None, - objective: Optional[MCMultiOutputObjective] = None, - constraints: Optional[list[Callable[[Tensor], Tensor]]] = None, - X_pending: Optional[Tensor] = None, - eta: Union[Tensor, float] = 1e-3, + sampler: MCSampler | None = None, + objective: MCMultiOutputObjective | None = None, + constraints: list[Callable[[Tensor], Tensor]] | None = None, + X_pending: Tensor | None = None, + eta: Tensor | float = 1e-3, prune_baseline: bool = False, alpha: float = 0.0, cache_pending: bool = True, @@ -343,7 +344,7 @@ def __init__( tau_relu: float = TAU_RELU, tau_max: float = 1e-3, # TAU_MAX, fat: bool = True, - marginalize_dim: Optional[int] = None, + marginalize_dim: int | None = None, ) -> None: r""" q-Log Noisy Expected Hypervolume Improvement supporting m>=2 outcomes. 
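`ref_point: list[float] | Tensor` admits either spelling of the union, so both calls below type-check against the `qLogNoisyExpectedHypervolumeImprovement` constructor above. A usage sketch, with `model` and `train_X` assumed to exist:

import torch
from botorch.acquisition.multi_objective.logei import (
    qLogNoisyExpectedHypervolumeImprovement,
)

# a plain list of floats ...
acqf = qLogNoisyExpectedHypervolumeImprovement(
    model=model, ref_point=[0.0, 0.0], X_baseline=train_X
)
# ... or an equivalent tensor
acqf = qLogNoisyExpectedHypervolumeImprovement(
    model=model, ref_point=torch.zeros(2), X_baseline=train_X
)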
diff --git a/botorch/acquisition/multi_objective/max_value_entropy_search.py b/botorch/acquisition/multi_objective/max_value_entropy_search.py index 5df13ced74..f9e0114b86 100644 --- a/botorch/acquisition/multi_objective/max_value_entropy_search.py +++ b/botorch/acquisition/multi_objective/max_value_entropy_search.py @@ -19,8 +19,9 @@ from __future__ import annotations +from collections.abc import Callable + from math import pi -from typing import Callable, Optional, Union import torch from botorch.acquisition.max_value_entropy_search import qMaxValueEntropy @@ -73,8 +74,8 @@ def __init__( model: Model, sample_pareto_frontiers: Callable[[Model], Tensor], num_fantasies: int = 16, - X_pending: Optional[Tensor] = None, - sampler: Optional[MCSampler] = None, + X_pending: Tensor | None = None, + sampler: MCSampler | None = None, ) -> None: r"""Multi-objective max-value entropy search acquisition function. @@ -119,7 +120,7 @@ def __init__( # This avoids attribute errors in qMaxValueEntropy code. self.posterior_transform = None - def set_X_pending(self, X_pending: Optional[Tensor] = None) -> None: + def set_X_pending(self, X_pending: Tensor | None = None) -> None: r"""Set pending points. Informs the acquisition function about pending design points, @@ -207,7 +208,7 @@ def __init__( self, model: Model, hypercell_bounds: Tensor, - X_pending: Optional[Tensor] = None, + X_pending: Tensor | None = None, estimation_type: str = "LB", num_samples: int = 64, ) -> None: @@ -240,7 +241,7 @@ def __init__( def _compute_posterior_statistics( self, X: Tensor - ) -> dict[str, Union[GPyTorchPosterior, Tensor]]: + ) -> dict[str, GPyTorchPosterior | Tensor]: r"""Compute the posterior statistics. Args: diff --git a/botorch/acquisition/multi_objective/monte_carlo.py b/botorch/acquisition/multi_objective/monte_carlo.py index f0773f36db..5a78078588 100644 --- a/botorch/acquisition/multi_objective/monte_carlo.py +++ b/botorch/acquisition/multi_objective/monte_carlo.py @@ -26,7 +26,8 @@ from __future__ import annotations -from typing import Callable, Optional, Union +from collections.abc import Callable + import torch from botorch.acquisition.multi_objective.base import MultiObjectiveMCAcquisitionFunction @@ -57,13 +58,13 @@ class qExpectedHypervolumeImprovement( def __init__( self, model: Model, - ref_point: Union[list[float], Tensor], + ref_point: list[float] | Tensor, partitioning: NondominatedPartitioning, - sampler: Optional[MCSampler] = None, - objective: Optional[MCMultiOutputObjective] = None, - constraints: Optional[list[Callable[[Tensor], Tensor]]] = None, - X_pending: Optional[Tensor] = None, - eta: Union[Tensor, float] = 1e-3, + sampler: MCSampler | None = None, + objective: MCMultiOutputObjective | None = None, + constraints: list[Callable[[Tensor], Tensor]] | None = None, + X_pending: Tensor | None = None, + eta: Tensor | float = 1e-3, fat: bool = False, ) -> None: r"""q-Expected Hypervolume Improvement supporting m>=2 outcomes. @@ -135,7 +136,7 @@ def __init__( SubsetIndexCachingMixin.__init__(self) self.fat = fat - def _compute_qehvi(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def _compute_qehvi(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Compute the expected (feasible) hypervolume improvement given MC samples. 
Args: @@ -236,13 +237,13 @@ class qNoisyExpectedHypervolumeImprovement( def __init__( self, model: Model, - ref_point: Union[list[float], Tensor], + ref_point: list[float] | Tensor, X_baseline: Tensor, - sampler: Optional[MCSampler] = None, - objective: Optional[MCMultiOutputObjective] = None, - constraints: Optional[list[Callable[[Tensor], Tensor]]] = None, - X_pending: Optional[Tensor] = None, - eta: Union[Tensor, float] = 1e-3, + sampler: MCSampler | None = None, + objective: MCMultiOutputObjective | None = None, + constraints: list[Callable[[Tensor], Tensor]] | None = None, + X_pending: Tensor | None = None, + eta: Tensor | float = 1e-3, fat: bool = False, prune_baseline: bool = False, alpha: float = 0.0, @@ -250,7 +251,7 @@ def __init__( max_iep: int = 0, incremental_nehvi: bool = True, cache_root: bool = True, - marginalize_dim: Optional[int] = None, + marginalize_dim: int | None = None, ) -> None: r"""q-Noisy Expected Hypervolume Improvement supporting m>=2 outcomes. diff --git a/botorch/acquisition/multi_objective/multi_fidelity.py b/botorch/acquisition/multi_objective/multi_fidelity.py index 9aec38ca01..0a681717b2 100644 --- a/botorch/acquisition/multi_objective/multi_fidelity.py +++ b/botorch/acquisition/multi_objective/multi_fidelity.py @@ -18,7 +18,8 @@ from __future__ import annotations -from typing import Callable, Optional, Union +from collections.abc import Callable + import torch from botorch.acquisition.cost_aware import InverseCostWeightedUtility @@ -41,14 +42,14 @@ class MOMF(qExpectedHypervolumeImprovement): def __init__( self, model: Model, - ref_point: Union[list[float], Tensor], + ref_point: list[float] | Tensor, partitioning: NondominatedPartitioning, - sampler: Optional[MCSampler] = None, - objective: Optional[MCMultiOutputObjective] = None, - constraints: Optional[list[Callable[[Tensor], Tensor]]] = None, - eta: Union[Tensor, float] = 1e-3, - X_pending: Optional[Tensor] = None, - cost_call: Optional[Callable[[Tensor], Tensor]] = None, + sampler: MCSampler | None = None, + objective: MCMultiOutputObjective | None = None, + constraints: list[Callable[[Tensor], Tensor]] | None = None, + eta: Tensor | float = 1e-3, + X_pending: Tensor | None = None, + cost_call: Callable[[Tensor], Tensor] | None = None, ) -> None: r"""MOMF acquisition function supporting m>=2 outcomes. The model needs to have train_obj that has a fidelity diff --git a/botorch/acquisition/multi_objective/multi_output_risk_measures.py b/botorch/acquisition/multi_objective/multi_output_risk_measures.py index a98dddea55..aa483ee9c5 100644 --- a/botorch/acquisition/multi_objective/multi_output_risk_measures.py +++ b/botorch/acquisition/multi_objective/multi_output_risk_measures.py @@ -28,8 +28,8 @@ import warnings from abc import ABC, abstractmethod +from collections.abc import Callable from math import ceil -from typing import Callable, Optional, Union import torch from botorch.acquisition.multi_objective.objective import ( @@ -60,7 +60,7 @@ class MultiOutputRiskMeasureMCObjective( def __init__( self, n_w: int, - preprocessing_function: Optional[Callable[[Tensor], Tensor]] = None, + preprocessing_function: Callable[[Tensor], Tensor] | None = None, ) -> None: r"""Transform the posterior samples to samples of a risk measure. 
@@ -92,7 +92,7 @@ def _prepare_samples(self, samples: Tensor) -> Tensor: return samples.view(*samples.shape[:-2], -1, self.n_w, samples.shape[-1]) @abstractmethod - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Calculate the risk measure corresponding to the given samples. Args: @@ -116,7 +116,7 @@ class MultiOutputExpectation(MultiOutputRiskMeasureMCObjective): reducing the cost of posterior sampling as a result. """ - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Calculate the expectation of the given samples. Expectation is calculated over each `n_w` samples in the q-batch dimension. @@ -166,7 +166,7 @@ def _get_sorted_prepared_samples(self, samples: Tensor) -> Tensor: prepared_samples = self._prepare_samples(samples) return prepared_samples.sort(dim=-2, descending=True).values - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Calculate the CVaR corresponding to the given samples. Args: @@ -194,7 +194,7 @@ class IndependentVaR(IndependentCVaR): `1 - alpha` quantile of a given random variable. """ - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Calculate the VaR corresponding to the given samples. Args: @@ -213,7 +213,7 @@ def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: class MultiOutputWorstCase(MultiOutputRiskMeasureMCObjective): r"""The multi-output worst-case risk measure.""" - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Calculate the worst-case measure corresponding to the given samples. Args: @@ -249,7 +249,7 @@ def __init__( n_w: int, alpha: float, expectation: bool = False, - preprocessing_function: Optional[Callable[[Tensor], Tensor]] = None, + preprocessing_function: Callable[[Tensor], Tensor] | None = None, *, pad_to_n_w: bool = False, filter_dominated: bool = True, @@ -483,7 +483,7 @@ def make_differentiable(self, prepared_samples: Tensor, mvars: Tensor) -> Tensor def forward( self, samples: Tensor, - X: Optional[Tensor] = None, + X: Tensor | None = None, ) -> Tensor: r"""Calculate the MVaR corresponding to the given samples. @@ -549,10 +549,10 @@ def __init__( self, alpha: float, n_w: int, - chebyshev_weights: Union[Tensor, list[float]], - baseline_Y: Optional[Tensor] = None, - ref_point: Optional[Union[Tensor, list[float]]] = None, - preprocessing_function: Optional[Callable[[Tensor], Tensor]] = None, + chebyshev_weights: Tensor | list[float], + baseline_Y: Tensor | None = None, + ref_point: Tensor | list[float] | None = None, + preprocessing_function: Callable[[Tensor], Tensor] | None = None, ) -> None: r"""Transform the posterior samples to samples of a risk measure. @@ -591,9 +591,9 @@ def __init__( def set_baseline_Y( self, - model: Optional[Model], - X_baseline: Optional[Tensor], - Y_samples: Optional[Tensor] = None, + model: Model | None, + X_baseline: Tensor | None, + Y_samples: Tensor | None = None, ) -> None: r"""Set the `baseline_Y` based on the MVaR predictions of the `model` for `X_baseline`. 
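Context for the `n_w` bookkeeping these signatures annotate: `_prepare_samples` (shown above) views the `q * n_w` q-batch dimension as separate `q` and `n_w` dimensions, and subclasses then reduce over `n_w`. A standalone sketch of the `MultiOutputExpectation`-style reduction, under those shape assumptions:

import torch
from torch import Tensor

def expectation_over_perturbations(samples: Tensor, n_w: int) -> Tensor:
    # samples: `sample_shape x batch_shape x (q * n_w) x m`, as in the hunks above
    prepared = samples.view(*samples.shape[:-2], -1, n_w, samples.shape[-1])
    # reduce over the `n_w` perturbation dimension -> `... x q x m`
    return prepared.mean(dim=-2)

samples = torch.randn(512, 8, 3)  # e.g. 512 MC samples, q * n_w = 8, m = 3
risk_neutral = expectation_over_perturbations(samples, n_w=4)  # -> 512 x 2 x 3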
@@ -629,7 +629,7 @@ def chebyshev_weights(self) -> Tensor: return self._chebyshev_weights @chebyshev_weights.setter - def chebyshev_weights(self, chebyshev_weights: Union[Tensor, list[float]]) -> None: + def chebyshev_weights(self, chebyshev_weights: Tensor | list[float]) -> None: r"""Update the Chebyshev weights. Invalidates the cached Chebyshev objective. @@ -649,12 +649,12 @@ def chebyshev_weights(self, chebyshev_weights: Union[Tensor, list[float]]) -> No self.register_buffer("_chebyshev_weights", chebyshev_weights) @property - def baseline_Y(self) -> Optional[Tensor]: + def baseline_Y(self) -> Tensor | None: r"""Baseline outcomes used in determining the normalization bounds.""" return self._baseline_Y @baseline_Y.setter - def baseline_Y(self, baseline_Y: Optional[Tensor]) -> None: + def baseline_Y(self, baseline_Y: Tensor | None) -> None: r"""Update the baseline outcomes. Invalidates the cached Chebyshev objective. @@ -668,7 +668,7 @@ def baseline_Y(self, baseline_Y: Optional[Tensor]) -> None: self.register_buffer("_baseline_Y", baseline_Y) @property - def chebyshev_objective(self) -> Callable[[Tensor, Optional[Tensor]], Tensor]: + def chebyshev_objective(self) -> Callable[[Tensor, Tensor | None], Tensor]: r"""The objective for applying the Chebyshev scalarization.""" if self._chebyshev_objective is None: self._construct_chebyshev_objective() @@ -695,7 +695,7 @@ def _construct_chebyshev_objective(self) -> None: if ref_point is not None: ref_point = normalize(ref_point.unsqueeze(0), bounds=Y_bounds).squeeze(0) - def chebyshev_obj(Y: Tensor, X: Optional[Tensor] = None) -> Tensor: + def chebyshev_obj(Y: Tensor, X: Tensor | None = None) -> Tensor: Y = self.preprocessing_function(Y) Y = normalize(Y, bounds=Y_bounds) if ref_point is not None: @@ -723,7 +723,7 @@ def _prepare_samples(self, samples: Tensor) -> Tensor: @staticmethod def _get_Y_normalization_bounds( Y: Tensor, - ref_point: Optional[Tensor] = None, + ref_point: Tensor | None = None, ) -> Tensor: r"""Get normalization bounds for scalarizations. diff --git a/botorch/acquisition/multi_objective/objective.py b/botorch/acquisition/multi_objective/objective.py index c3cf0c44ec..065990af61 100644 --- a/botorch/acquisition/multi_objective/objective.py +++ b/botorch/acquisition/multi_objective/objective.py @@ -7,7 +7,6 @@ from __future__ import annotations from abc import abstractmethod -from typing import Optional import torch from botorch.acquisition.objective import GenericMCObjective, MCAcquisitionObjective @@ -28,7 +27,7 @@ class MCMultiOutputObjective(MCAcquisitionObjective): _is_mo: bool = True @abstractmethod - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Evaluate the multi-output objective on the samples. Args: @@ -72,7 +71,7 @@ class IdentityMCMultiOutputObjective(MCMultiOutputObjective): """ def __init__( - self, outcomes: Optional[list[int]] = None, num_outcomes: Optional[int] = None + self, outcomes: list[int] | None = None, num_outcomes: int | None = None ) -> None: r"""Initialize Objective. 
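The `baseline_Y` property and setter above in `multi_output_risk_measures.py` show the union spelling carried through accessors backed by registered buffers. A minimal sketch of that pattern (hypothetical class, not from the patch):

from __future__ import annotations

from torch import Tensor
from torch.nn import Module

class BaselineHolder(Module):
    def __init__(self, baseline_Y: Tensor | None = None) -> None:
        super().__init__()
        self.baseline_Y = baseline_Y  # routed through the setter below

    @property
    def baseline_Y(self) -> Tensor | None:
        return self._baseline_Y

    @baseline_Y.setter
    def baseline_Y(self, baseline_Y: Tensor | None) -> None:
        # registering as a buffer keeps device / dtype handling consistent
        self.register_buffer("_baseline_Y", baseline_Y)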
@@ -95,7 +94,7 @@ def __init__( outcomes = normalize_indices(outcomes, num_outcomes) self.register_buffer("outcomes", torch.tensor(outcomes, dtype=torch.long)) - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: if hasattr(self, "outcomes"): return samples.index_select(-1, self.outcomes.to(device=samples.device)) return samples @@ -114,8 +113,8 @@ class WeightedMCMultiOutputObjective(IdentityMCMultiOutputObjective): def __init__( self, weights: Tensor, - outcomes: Optional[list[int]] = None, - num_outcomes: Optional[int] = None, + outcomes: list[int] | None = None, + num_outcomes: int | None = None, ) -> None: r"""Initialize Objective. @@ -137,7 +136,7 @@ def __init__( ) self.register_buffer("weights", weights) - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: samples = super().forward(samples=samples) return samples * self.weights.to(samples) @@ -148,7 +147,7 @@ def __init__( model: Model, X_baseline: Tensor, constraint_idcs: list[int], - objective: Optional[MCMultiOutputObjective] = None, + objective: MCMultiOutputObjective | None = None, ) -> None: r"""Construct a feasibility-weighted objective. @@ -185,9 +184,7 @@ def __init__( X=X_baseline, model=model, objective=lambda y, X: y )[objective_idcs] - def apply_feasibility_weights( - Y: Tensor, X: Optional[Tensor] = None - ) -> Tensor: + def apply_feasibility_weights(Y: Tensor, X: Tensor | None = None) -> Tensor: return apply_constraints( obj=Y[..., objective_idcs], constraints=[lambda Y: -Y[..., i] for i in constraint_idcs], @@ -205,5 +202,5 @@ def apply_feasibility_weights( self.objective = objective self._verify_output_shape = objective._verify_output_shape - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: return self.objective(self.apply_feasibility_weights(samples), X=X) diff --git a/botorch/acquisition/multi_objective/parego.py b/botorch/acquisition/multi_objective/parego.py index cddf3137d9..da8ea0b066 100644 --- a/botorch/acquisition/multi_objective/parego.py +++ b/botorch/acquisition/multi_objective/parego.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Callable, Optional, Union +from collections.abc import Callable import torch from botorch.acquisition.logei import qLogNoisyExpectedImprovement, TAU_MAX, TAU_RELU @@ -24,12 +24,12 @@ def __init__( self, model: Model, X_baseline: Tensor, - scalarization_weights: Optional[Tensor] = None, - sampler: Optional[MCSampler] = None, - objective: Optional[MCMultiOutputObjective] = None, - constraints: Optional[list[Callable[[Tensor], Tensor]]] = None, - X_pending: Optional[Tensor] = None, - eta: Union[Tensor, float] = 1e-3, + scalarization_weights: Tensor | None = None, + sampler: MCSampler | None = None, + objective: MCMultiOutputObjective | None = None, + constraints: list[Callable[[Tensor], Tensor]] | None = None, + X_pending: Tensor | None = None, + eta: Tensor | float = 1e-3, fat: bool = True, prune_baseline: bool = False, cache_root: bool = True, @@ -138,7 +138,7 @@ def __init__( # Set these after __init__ calls so that they're not overwritten / deleted. # These are intended mainly for easier debugging & transparency. 
self._org_objective: MCMultiOutputObjective = org_objective - self.chebyshev_scalarization: Callable[[Tensor, Optional[Tensor]], Tensor] = ( + self.chebyshev_scalarization: Callable[[Tensor, Tensor | None], Tensor] = ( chebyshev_scalarization ) self.scalarization_weights: Tensor = scalarization_weights diff --git a/botorch/acquisition/multi_objective/predictive_entropy_search.py b/botorch/acquisition/multi_objective/predictive_entropy_search.py index 6157ecab7e..c4e9f1a391 100644 --- a/botorch/acquisition/multi_objective/predictive_entropy_search.py +++ b/botorch/acquisition/multi_objective/predictive_entropy_search.py @@ -23,7 +23,6 @@ from __future__ import annotations -from typing import Optional import torch from botorch.acquisition.acquisition import AcquisitionFunction @@ -102,7 +101,7 @@ def __init__( model: Model, pareto_sets: Tensor, maximize: bool = True, - X_pending: Optional[Tensor] = None, + X_pending: Tensor | None = None, max_ep_iterations: int = 250, ep_jitter: float = 1e-4, test_jitter: float = 1e-4, diff --git a/botorch/acquisition/multi_objective/utils.py b/botorch/acquisition/multi_objective/utils.py index 30448b587b..9f1c87808b 100644 --- a/botorch/acquisition/multi_objective/utils.py +++ b/botorch/acquisition/multi_objective/utils.py @@ -12,8 +12,9 @@ import math import warnings +from collections.abc import Callable from math import ceil -from typing import Any, Callable, Optional +from typing import Any import torch from botorch.acquisition import monte_carlo # noqa F401 @@ -68,11 +69,11 @@ def prune_inferior_points_multi_objective( model: Model, X: Tensor, ref_point: Tensor, - objective: Optional[MCMultiOutputObjective] = None, - constraints: Optional[list[Callable[[Tensor], Tensor]]] = None, + objective: MCMultiOutputObjective | None = None, + constraints: list[Callable[[Tensor], Tensor]] | None = None, num_samples: int = 2048, max_frac: float = 1.0, - marginalize_dim: Optional[int] = None, + marginalize_dim: int | None = None, ) -> Tensor: r"""Prune points from an input tensor that are unlikely to be pareto optimal. @@ -165,7 +166,7 @@ def compute_sample_box_decomposition( pareto_fronts: Tensor, partitioning: BoxDecomposition = DominatedPartitioning, maximize: bool = True, - num_constraints: Optional[int] = 0, + num_constraints: int | None = 0, ) -> Tensor: r"""Computes the box decomposition associated with some sampled optimal objectives. This also supports the single-objective and constrained optimization @@ -321,7 +322,7 @@ def sample_optimal_points( [GenericDeterministicModel, Tensor, int, bool, Any], tuple[Tensor, Tensor] ] = random_search_optimizer, maximize: bool = True, - optimizer_kwargs: Optional[dict[str, Any]] = None, + optimizer_kwargs: dict[str, Any] | None = None, ) -> tuple[Tensor, Tensor]: r"""Compute a collection of optimal inputs and outputs from samples of a Gaussian Process (GP). 
diff --git a/botorch/acquisition/multi_step_lookahead.py b/botorch/acquisition/multi_step_lookahead.py index 930878571f..79208b8596 100644 --- a/botorch/acquisition/multi_step_lookahead.py +++ b/botorch/acquisition/multi_step_lookahead.py @@ -19,7 +19,8 @@ import math import warnings -from typing import Any, Callable, Optional +from collections.abc import Callable +from typing import Any import numpy as np import torch @@ -53,14 +54,14 @@ def __init__( self, model: Model, batch_sizes: list[int], - num_fantasies: Optional[list[int]] = None, - samplers: Optional[list[MCSampler]] = None, - valfunc_cls: Optional[list[Optional[type[AcquisitionFunction]]]] = None, - valfunc_argfacs: Optional[list[Optional[TAcqfArgConstructor]]] = None, - objective: Optional[MCAcquisitionObjective] = None, - posterior_transform: Optional[PosteriorTransform] = None, - inner_mc_samples: Optional[list[int]] = None, - X_pending: Optional[Tensor] = None, + num_fantasies: list[int] | None = None, + samplers: list[MCSampler] | None = None, + valfunc_cls: list[type[AcquisitionFunction] | None] | None = None, + valfunc_argfacs: list[TAcqfArgConstructor | None] | None = None, + objective: MCAcquisitionObjective | None = None, + posterior_transform: PosteriorTransform | None = None, + inner_mc_samples: list[int] | None = None, + X_pending: Tensor | None = None, collapse_fantasy_base_samples: bool = True, ) -> None: r"""q-Multi-Step Look-Ahead (one-shot optimization). @@ -317,14 +318,14 @@ def get_induced_fantasy_model(self, X: Tensor) -> Model: def _step( model: Model, Xs: list[Tensor], - samplers: list[Optional[MCSampler]], - valfunc_cls: list[Optional[type[AcquisitionFunction]]], - valfunc_argfacs: list[Optional[TAcqfArgConstructor]], - inner_samplers: list[Optional[MCSampler]], + samplers: list[MCSampler | None], + valfunc_cls: list[type[AcquisitionFunction] | None], + valfunc_argfacs: list[TAcqfArgConstructor | None], + inner_samplers: list[MCSampler | None], objective: MCAcquisitionObjective, - posterior_transform: Optional[PosteriorTransform], - running_val: Optional[Tensor] = None, - sample_weights: Optional[Tensor] = None, + posterior_transform: PosteriorTransform | None, + running_val: Tensor | None = None, + sample_weights: Tensor | None = None, step_index: int = 0, ) -> Tensor: r"""Recursive multi-step look-ahead computation. @@ -424,13 +425,13 @@ def _step( def _compute_stage_value( model: Model, - valfunc_cls: Optional[type[AcquisitionFunction]], + valfunc_cls: type[AcquisitionFunction] | None, X: Tensor, objective: MCAcquisitionObjective, - posterior_transform: Optional[PosteriorTransform], - inner_sampler: Optional[MCSampler] = None, - arg_fac: Optional[TAcqfArgConstructor] = None, -) -> Optional[Tensor]: + posterior_transform: PosteriorTransform | None, + inner_sampler: MCSampler | None = None, + arg_fac: TAcqfArgConstructor | None = None, +) -> Tensor | None: r"""Compute the stage value of a multi-step look-ahead policy. Args: @@ -471,7 +472,7 @@ def _compute_stage_value( def _construct_sample_weights( prev_weights: Tensor, sampler: MCSampler -) -> Optional[Tensor]: +) -> Tensor | None: r"""Iteratively construct tensor of sample weights for multi-step look-ahead. 
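The deepest rewrite in the `qMultiStepLookahead` signature above is `valfunc_cls`: `Optional[list[Optional[type[AcquisitionFunction]]]]` becomes `list[type[AcquisitionFunction] | None] | None`, read inside-out as "an optional list whose entries are optional acquisition-function classes". A small sketch of consuming such a value (the helper is hypothetical):

from __future__ import annotations

from botorch.acquisition.acquisition import AcquisitionFunction

def stage_value_classes(
    valfunc_cls: list[type[AcquisitionFunction] | None] | None,
    num_stages: int,
) -> list[type[AcquisitionFunction] | None]:
    # None for the whole argument means "no stage value functions at all";
    # None for a single entry means "no value function at that stage".
    if valfunc_cls is None:
        return [None] * num_stages
    return valfunc_cls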
Args: @@ -501,10 +502,10 @@ def _construct_sample_weights( def _construct_inner_samplers( batch_sizes: list[int], - valfunc_cls: list[Optional[type[AcquisitionFunction]]], - inner_mc_samples: list[Optional[int]], - objective: Optional[MCAcquisitionObjective] = None, -) -> list[Optional[MCSampler]]: + valfunc_cls: list[type[AcquisitionFunction] | None], + inner_mc_samples: list[int | None], + objective: MCAcquisitionObjective | None = None, +) -> list[MCSampler | None]: r"""Check validity of inputs and construct inner samplers. Helper function to be used internally for constructing inner samplers. @@ -562,7 +563,7 @@ def _construct_inner_samplers( def _get_induced_fantasy_model( - model: Model, Xs: list[Tensor], samplers: list[Optional[MCSampler]] + model: Model, Xs: list[Tensor], samplers: list[MCSampler | None] ) -> Model: r"""Recursive computation of the fantasy model induced by an input tree. diff --git a/botorch/acquisition/objective.py b/botorch/acquisition/objective.py index 05d8b7d4f8..7596b44a8e 100644 --- a/botorch/acquisition/objective.py +++ b/botorch/acquisition/objective.py @@ -10,7 +10,8 @@ import warnings from abc import ABC, abstractmethod -from typing import Callable, Optional, TYPE_CHECKING, Union +from collections.abc import Callable +from typing import TYPE_CHECKING import torch from botorch.exceptions.errors import UnsupportedError @@ -107,7 +108,7 @@ def evaluate(self, Y: Tensor) -> Tensor: return self.offset + Y @ self.weights def forward( - self, posterior: Union[GPyTorchPosterior, PosteriorList] + self, posterior: GPyTorchPosterior | PosteriorList ) -> GPyTorchPosterior: r"""Compute the posterior of the affine transformation. @@ -137,7 +138,7 @@ class ExpectationPosteriorTransform(PosteriorTransform): this operates over the q-batch dimension. """ - def __init__(self, n_w: int, weights: Optional[Tensor] = None) -> None: + def __init__(self, n_w: int, weights: Tensor | None = None) -> None: r"""A posterior transform calculating the expectation over the q-batch dimension. @@ -242,7 +243,7 @@ class MCAcquisitionObjective(Module, ABC): _is_mo: bool = False @abstractmethod - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Evaluate the objective on the samples. Args: @@ -265,7 +266,7 @@ def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: pass # pragma: no cover def __call__( - self, samples: Tensor, X: Optional[Tensor] = None, *args, **kwargs + self, samples: Tensor, X: Tensor | None = None, *args, **kwargs ) -> Tensor: output = super().__call__(samples=samples, X=X, *args, **kwargs) # q-batch dimension is at -1 for single-output objectives and at @@ -294,7 +295,7 @@ class IdentityMCObjective(MCAcquisitionObjective): >>> objective = identity_objective(samples) """ - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: return samples.squeeze(-1) @@ -324,7 +325,7 @@ def __init__(self, weights: Tensor) -> None: raise ValueError("weights must be a one-dimensional tensor.") self.register_buffer("weights", weights) - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Evaluate the linear objective on the samples. 
Args: @@ -356,7 +357,7 @@ class GenericMCObjective(MCAcquisitionObjective): >>> objective = generic_objective(samples) """ - def __init__(self, objective: Callable[[Tensor, Optional[Tensor]], Tensor]) -> None: + def __init__(self, objective: Callable[[Tensor, Tensor | None], Tensor]) -> None: r""" Args: objective: A callable `f(samples, X)` mapping a @@ -367,7 +368,7 @@ def __init__(self, objective: Callable[[Tensor, Optional[Tensor]], Tensor]) -> N super().__init__() self.objective = objective - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Evaluate the objective on the samples. Args: @@ -411,10 +412,10 @@ class ConstrainedMCObjective(GenericMCObjective): def __init__( self, - objective: Callable[[Tensor, Optional[Tensor]], Tensor], + objective: Callable[[Tensor, Tensor | None], Tensor], constraints: list[Callable[[Tensor], Tensor]], - infeasible_cost: Union[Tensor, float] = 0.0, - eta: Union[Tensor, float] = 1e-3, + infeasible_cost: Tensor | float = 0.0, + eta: Tensor | float = 1e-3, ) -> None: r""" Args: @@ -442,7 +443,7 @@ def __init__( self.register_buffer("eta", eta) self.register_buffer("infeasible_cost", torch.as_tensor(infeasible_cost)) - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Evaluate the feasibility-weighted objective on the samples. Args: @@ -489,8 +490,8 @@ class LearnedObjective(MCAcquisitionObjective): def __init__( self, pref_model: Model, - sample_shape: Optional[torch.Size] = None, - seed: Optional[int] = None, + sample_shape: torch.Size | None = None, + seed: int | None = None, ): r""" Args: @@ -524,7 +525,7 @@ def __init__( self.sampler = IIDNormalSampler(sample_shape=sample_shape, seed=seed) self.sampler.batch_range_override = (1, -1) - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Sample each element of samples. Args: diff --git a/botorch/acquisition/penalized.py b/botorch/acquisition/penalized.py index c01b1927eb..48695eb606 100644 --- a/botorch/acquisition/penalized.py +++ b/botorch/acquisition/penalized.py @@ -11,7 +11,8 @@ from __future__ import annotations import math -from typing import Any, Callable, Optional +from collections.abc import Callable +from typing import Any import torch from botorch.acquisition.acquisition import AcquisitionFunction @@ -233,10 +234,10 @@ def forward(self, X: Tensor) -> Tensor: return raw_value - self.regularization_parameter * penalty_term @property - def X_pending(self) -> Optional[Tensor]: + def X_pending(self) -> Tensor | None: return self.raw_acqf.X_pending - def set_X_pending(self, X_pending: Optional[Tensor] = None) -> None: + def set_X_pending(self, X_pending: Tensor | None = None) -> None: if not isinstance(self.raw_acqf, AnalyticAcquisitionFunction): self.raw_acqf.set_X_pending(X_pending=X_pending) else: @@ -321,10 +322,10 @@ class PenalizedMCObjective(GenericMCObjective): def __init__( self, - objective: Callable[[Tensor, Optional[Tensor]], Tensor], + objective: Callable[[Tensor, Tensor | None], Tensor], penalty_objective: torch.nn.Module, regularization_parameter: float, - expand_dim: Optional[int] = None, + expand_dim: int | None = None, ) -> None: r"""Penalized MC objective. 
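For `GenericMCObjective` (rewritten above), `Callable[[Tensor, Tensor | None], Tensor]` encodes that the wrapped callable receives the MC samples plus an optional `X`. A usage sketch with a toy objective:

from __future__ import annotations

from botorch.acquisition.objective import GenericMCObjective
from torch import Tensor

def toy_objective(samples: Tensor, X: Tensor | None = None) -> Tensor:
    # samples: `sample_shape x batch_shape x q x m`; X may be None and is unused here
    return samples[..., 0] - samples[..., 1].abs()

objective = GenericMCObjective(toy_objective)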
@@ -345,7 +346,7 @@ def __init__( self.regularization_parameter = regularization_parameter self.expand_dim = expand_dim - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Evaluate the penalized objective on the samples. Args: diff --git a/botorch/acquisition/predictive_entropy_search.py b/botorch/acquisition/predictive_entropy_search.py index c06e98a562..8a054f1c42 100644 --- a/botorch/acquisition/predictive_entropy_search.py +++ b/botorch/acquisition/predictive_entropy_search.py @@ -15,7 +15,6 @@ from __future__ import annotations -from typing import Optional from botorch.acquisition.multi_objective.predictive_entropy_search import ( qMultiObjectivePredictiveEntropySearch, @@ -48,7 +47,7 @@ def __init__( model: Model, optimal_inputs: Tensor, maximize: bool = True, - X_pending: Optional[Tensor] = None, + X_pending: Tensor | None = None, max_ep_iterations: int = 250, ep_jitter: float = 1e-4, test_jitter: float = 1e-4, diff --git a/botorch/acquisition/preference.py b/botorch/acquisition/preference.py index c35affb3b2..6455650fb0 100644 --- a/botorch/acquisition/preference.py +++ b/botorch/acquisition/preference.py @@ -28,7 +28,6 @@ from __future__ import annotations -from typing import Optional import torch from botorch.acquisition import AnalyticAcquisitionFunction @@ -55,8 +54,8 @@ class AnalyticExpectedUtilityOfBestOption(AnalyticAcquisitionFunction): def __init__( self, pref_model: Model, - outcome_model: Optional[DeterministicModel] = None, - previous_winner: Optional[Tensor] = None, + outcome_model: DeterministicModel | None = None, + previous_winner: Tensor | None = None, ) -> None: r"""Analytic implementation of Expected Utility of the Best Option under the Laplace model (assumes a PairwiseGP is used as the preference model) as @@ -145,11 +144,11 @@ class qExpectedUtilityOfBestOption(MCAcquisitionFunction): def __init__( self, pref_model: Model, - outcome_model: Optional[DeterministicModel] = None, - sampler: Optional[MCSampler] = None, - objective: Optional[MCAcquisitionObjective] = None, - posterior_transform: Optional[PosteriorTransform] = None, - X_pending: Optional[Tensor] = None, + outcome_model: DeterministicModel | None = None, + sampler: MCSampler | None = None, + objective: MCAcquisitionObjective | None = None, + posterior_transform: PosteriorTransform | None = None, + X_pending: Tensor | None = None, ) -> None: r"""MC-based Expected Utility of Best Option (qEUBO) as proposed in [Astudillo2023qeubo]_. 
@@ -208,9 +207,9 @@ class PairwiseBayesianActiveLearningByDisagreement(MCAcquisitionFunction): def __init__( self, pref_model: Model, - outcome_model: Optional[DeterministicModel] = None, - num_samples: Optional[int] = 1024, - std_noise: Optional[float] = 0.0, + outcome_model: DeterministicModel | None = None, + num_samples: int | None = 1024, + std_noise: float | None = 0.0, ) -> None: """ Monte Carlo implementation of Bayesian Active Learning by Disagreement (BALD) diff --git a/botorch/acquisition/prior_guided.py b/botorch/acquisition/prior_guided.py index d7fa9d81c0..6587b7442f 100644 --- a/botorch/acquisition/prior_guided.py +++ b/botorch/acquisition/prior_guided.py @@ -17,7 +17,6 @@ """ from __future__ import annotations -from typing import Optional from botorch.acquisition.acquisition import AcquisitionFunction from botorch.acquisition.monte_carlo import SampleReducingMCAcquisitionFunction @@ -43,7 +42,7 @@ def __init__( prior_module: Module, log: bool = False, prior_exponent: float = 1.0, - X_pending: Optional[Tensor] = None, + X_pending: Tensor | None = None, ) -> None: r"""Initialize the prior-guided acquisition function. diff --git a/botorch/acquisition/proximal.py b/botorch/acquisition/proximal.py index 9cd4aed7ad..570c8469c9 100644 --- a/botorch/acquisition/proximal.py +++ b/botorch/acquisition/proximal.py @@ -11,7 +11,6 @@ from __future__ import annotations -from typing import Optional import torch from botorch.acquisition import AcquisitionFunction @@ -52,8 +51,8 @@ def __init__( self, acq_function: AcquisitionFunction, proximal_weights: Tensor, - transformed_weighting: Optional[bool] = True, - beta: Optional[float] = None, + transformed_weighting: bool | None = True, + beta: float | None = None, ) -> None: r"""Derived Acquisition Function weighted by proximity to recently observed point. @@ -211,7 +210,7 @@ def _validate_model(model: Model, proximal_weights: Tensor) -> None: ) -def _get_input_transform(model: Model) -> Optional[InputTransform]: +def _get_input_transform(model: Model) -> InputTransform | None: """get input transform if defined""" try: return model.input_transform diff --git a/botorch/acquisition/risk_measures.py b/botorch/acquisition/risk_measures.py index 0ef5361c36..e82e5caf89 100644 --- a/botorch/acquisition/risk_measures.py +++ b/botorch/acquisition/risk_measures.py @@ -21,8 +21,8 @@ """ from abc import ABC, abstractmethod +from collections.abc import Callable from math import ceil -from typing import Callable, Optional import torch from botorch.acquisition.multi_objective.objective import IdentityMCMultiOutputObjective @@ -49,7 +49,7 @@ class RiskMeasureMCObjective(MCAcquisitionObjective, ABC): def __init__( self, n_w: int, - preprocessing_function: Optional[Callable[[Tensor], Tensor]] = None, + preprocessing_function: Callable[[Tensor], Tensor] | None = None, ) -> None: r"""Transform the posterior samples to samples of a risk measure. @@ -94,7 +94,7 @@ def _prepare_samples(self, samples: Tensor) -> Tensor: return samples.view(*samples.shape[:-1], -1, self.n_w) @abstractmethod - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Calculate the risk measure corresponding to the given samples. 
Args: @@ -127,7 +127,7 @@ def __init__( self, alpha: float, n_w: int, - preprocessing_function: Optional[Callable[[Tensor], Tensor]] = None, + preprocessing_function: Callable[[Tensor], Tensor] | None = None, ) -> None: r"""Transform the posterior samples to samples of a risk measure. @@ -147,7 +147,7 @@ def __init__( self.alpha = alpha self.alpha_idx = ceil(n_w * alpha) - 1 - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Calculate the CVaR corresponding to the given samples. Args: @@ -181,7 +181,7 @@ def __init__( self, alpha: float, n_w: int, - preprocessing_function: Optional[Callable[[Tensor], Tensor]] = None, + preprocessing_function: Callable[[Tensor], Tensor] | None = None, ) -> None: r"""Transform the posterior samples to samples of a risk measure. @@ -202,7 +202,7 @@ def __init__( ) self._q = 1 - self.alpha_idx / n_w - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Calculate the VaR corresponding to the given samples. Args: @@ -234,7 +234,7 @@ def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: class WorstCase(RiskMeasureMCObjective): r"""The worst-case risk measure.""" - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Calculate the worst-case measure corresponding to the given samples. Args: @@ -259,7 +259,7 @@ class Expectation(RiskMeasureMCObjective): reducing the cost of posterior sampling as a result. """ - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: r"""Calculate the expectation corresponding to the given samples. This calculates the expectation / mean / average of each `n_w` samples across the q-batch dimension. If `self.weights` is given, the samples diff --git a/botorch/acquisition/thompson_sampling.py b/botorch/acquisition/thompson_sampling.py index 7f3d1c40e9..ad596bc2d5 100644 --- a/botorch/acquisition/thompson_sampling.py +++ b/botorch/acquisition/thompson_sampling.py @@ -3,7 +3,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Optional import torch from botorch.acquisition.analytic import AcquisitionFunction @@ -33,7 +32,7 @@ class PathwiseThompsonSampling(AcquisitionFunction): def __init__( self, model: Model, - posterior_transform: Optional[PosteriorTransform] = None, + posterior_transform: PosteriorTransform | None = None, ) -> None: r"""Single-outcome TS. 
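Context for the `CVaR`/`VaR` hunks in `risk_measures.py` above: `alpha_idx = ceil(n_w * alpha) - 1` indexes the VaR in a descending sort, and CVaR averages everything from that index on. A numeric sketch, equivalent in spirit to the implementation rather than copied from it:

from math import ceil

import torch

n_w, alpha = 32, 0.8
alpha_idx = ceil(n_w * alpha) - 1  # -> 25

samples = torch.randn(10, n_w)  # 10 designs, n_w perturbations each
sorted_desc = samples.sort(dim=-1, descending=True).values
var = sorted_desc[..., alpha_idx]                 # the `1 - alpha` quantile
cvar = sorted_desc[..., alpha_idx:].mean(dim=-1)  # mean of the worst outcomes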
@@ -49,7 +48,7 @@ def __init__( ) super().__init__(model=model) - self.batch_size: Optional[int] = None + self.batch_size: int | None = None def redraw(self) -> None: self.samples = get_matheron_path_model( diff --git a/botorch/acquisition/utils.py b/botorch/acquisition/utils.py index ae4f054321..e53bc0a7d0 100644 --- a/botorch/acquisition/utils.py +++ b/botorch/acquisition/utils.py @@ -11,7 +11,7 @@ from __future__ import annotations import math -from typing import Callable, Optional +from collections.abc import Callable import torch from botorch.acquisition.objective import ( @@ -90,12 +90,12 @@ def repeat_to_match_aug_dim(target_tensor: Tensor, reference_tensor: Tensor) -> def compute_best_feasible_objective( samples: Tensor, obj: Tensor, - constraints: Optional[list[Callable[[Tensor], Tensor]]], - model: Optional[Model] = None, - objective: Optional[MCAcquisitionObjective] = None, - posterior_transform: Optional[PosteriorTransform] = None, - X_baseline: Optional[Tensor] = None, - infeasible_obj: Optional[Tensor] = None, + constraints: list[Callable[[Tensor], Tensor]] | None, + model: Model | None = None, + objective: MCAcquisitionObjective | None = None, + posterior_transform: PosteriorTransform | None = None, + X_baseline: Tensor | None = None, + infeasible_obj: Tensor | None = None, ) -> Tensor: """Computes the largest `obj` value that is feasible under the `constraints`. If `constraints` is None, returns the best unconstrained objective value. @@ -165,8 +165,8 @@ def compute_best_feasible_objective( def _estimate_objective_lower_bound( model: Model, - objective: Optional[MCAcquisitionObjective], - posterior_transform: Optional[PosteriorTransform], + objective: MCAcquisitionObjective | None, + posterior_transform: PosteriorTransform | None, X: Tensor, ) -> Tensor: """Estimates a lower bound on the objective values by evaluating the model at convex @@ -203,8 +203,8 @@ def _estimate_objective_lower_bound( def get_infeasible_cost( X: Tensor, model: Model, - objective: Optional[Callable[[Tensor, Optional[Tensor]], Tensor]] = None, - posterior_transform: Optional[PosteriorTransform] = None, + objective: Callable[[Tensor, Tensor | None], Tensor] | None = None, + posterior_transform: PosteriorTransform | None = None, ) -> Tensor: r"""Get infeasible cost for a model and objective. @@ -229,7 +229,7 @@ def get_infeasible_cost( """ if objective is None: - def objective(Y: Tensor, X: Optional[Tensor] = None): + def objective(Y: Tensor, X: Tensor | None = None): return Y.squeeze(-1) posterior = model.posterior(X, posterior_transform=posterior_transform) @@ -245,13 +245,13 @@ def objective(Y: Tensor, X: Optional[Tensor] = None): def prune_inferior_points( model: Model, X: Tensor, - objective: Optional[MCAcquisitionObjective] = None, - posterior_transform: Optional[PosteriorTransform] = None, - constraints: Optional[list[Callable[[Tensor], Tensor]]] = None, + objective: MCAcquisitionObjective | None = None, + posterior_transform: PosteriorTransform | None = None, + constraints: list[Callable[[Tensor], Tensor]] | None = None, num_samples: int = 2048, max_frac: float = 1.0, - sampler: Optional[MCSampler] = None, - marginalize_dim: Optional[int] = None, + sampler: MCSampler | None = None, + marginalize_dim: int | None = None, ) -> Tensor: r"""Prune points from an input tensor that are unlikely to be the best point. 
@@ -352,8 +352,8 @@ def prune_inferior_points( def project_to_target_fidelity( X: Tensor, - target_fidelities: Optional[dict[int, float]] = None, - d: Optional[int] = None, + target_fidelities: dict[int, float] | None = None, + d: int | None = None, ) -> Tensor: r"""Project `X` onto the target set of fidelities. @@ -414,7 +414,7 @@ def project_to_target_fidelity( def expand_trace_observations( - X: Tensor, fidelity_dims: Optional[list[int]] = None, num_trace_obs: int = 0 + X: Tensor, fidelity_dims: list[int] | None = None, num_trace_obs: int = 0 ) -> Tensor: r"""Expand `X` with trace observations. diff --git a/botorch/cross_validation.py b/botorch/cross_validation.py index 5478a1a539..613888f795 100644 --- a/botorch/cross_validation.py +++ b/botorch/cross_validation.py @@ -10,7 +10,7 @@ from __future__ import annotations -from typing import Any, NamedTuple, Optional +from typing import Any, NamedTuple import torch from botorch.exceptions.errors import UnsupportedError @@ -27,19 +27,19 @@ class CVFolds(NamedTuple): test_X: Tensor train_Y: Tensor test_Y: Tensor - train_Yvar: Optional[Tensor] = None - test_Yvar: Optional[Tensor] = None + train_Yvar: Tensor | None = None + test_Yvar: Tensor | None = None class CVResults(NamedTuple): model: GPyTorchModel posterior: GPyTorchPosterior observed_Y: Tensor - observed_Yvar: Optional[Tensor] = None + observed_Yvar: Tensor | None = None def gen_loo_cv_folds( - train_X: Tensor, train_Y: Tensor, train_Yvar: Optional[Tensor] = None + train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor | None = None ) -> CVFolds: r"""Generate LOO CV folds w.r.t. `n`. @@ -112,9 +112,9 @@ def batch_cross_validation( model_cls: type[GPyTorchModel], mll_cls: type[MarginalLogLikelihood], cv_folds: CVFolds, - fit_args: Optional[dict[str, Any]] = None, + fit_args: dict[str, Any] | None = None, observation_noise: bool = False, - model_init_kwargs: Optional[dict[str, Any]] = None, + model_init_kwargs: dict[str, Any] | None = None, ) -> CVResults: r"""Perform cross-validation using GPyTorch batch mode. diff --git a/botorch/fit.py b/botorch/fit.py index 44c74618bd..1df72330e2 100644 --- a/botorch/fit.py +++ b/botorch/fit.py @@ -9,11 +9,11 @@ from __future__ import annotations import logging -from collections.abc import Sequence +from collections.abc import Callable, Sequence from copy import deepcopy from functools import partial from itertools import filterfalse -from typing import Any, Callable, Optional, Union +from typing import Any from warnings import catch_warnings, simplefilter, warn_explicit, WarningMessage from botorch.exceptions.errors import ModelFittingError, UnsupportedError @@ -74,10 +74,10 @@ def _rethrow_warn(w: WarningMessage) -> bool: def fit_gpytorch_mll( mll: MarginalLogLikelihood, - closure: Optional[Callable[[], tuple[Tensor, Sequence[Optional[Tensor]]]]] = None, - optimizer: Optional[Callable] = None, - closure_kwargs: Optional[dict[str, Any]] = None, - optimizer_kwargs: Optional[dict[str, Any]] = None, + closure: Callable[[], tuple[Tensor, Sequence[Tensor | None]]] | None = None, + optimizer: Callable | None = None, + closure_kwargs: dict[str, Any] | None = None, + optimizer_kwargs: dict[str, Any] | None = None, **kwargs: Any, ) -> MarginalLogLikelihood: r"""Clearing house for fitting models passed as GPyTorch MarginalLogLikelihoods.
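The new `closure` annotation on `fit_gpytorch_mll` spells out that a custom closure returns the loss together with per-parameter gradients, any of which may be `None` (as `torch.autograd.grad(..., allow_unused=True)` produces). A sketch of a conforming closure, assuming an `mll` instance is in scope:

from __future__ import annotations

from collections.abc import Sequence

import torch
from torch import Tensor

params = [p for p in mll.parameters() if p.requires_grad]

def loss_closure() -> tuple[Tensor, Sequence[Tensor | None]]:
    model = mll.model
    output = model(*model.train_inputs)
    loss = -mll(output, model.train_targets).sum()
    # entries are None for parameters the loss does not depend on
    grads = torch.autograd.grad(loss, params, allow_unused=True)
    return loss, grads

# usage (sketch): fit_gpytorch_mll(mll, closure=loss_closure)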
@@ -119,10 +119,10 @@ def _fit_fallback( _: type[object], __: type[object], *, - closure: Optional[Callable[[], tuple[Tensor, Sequence[Optional[Tensor]]]]] = None, + closure: Callable[[], tuple[Tensor, Sequence[Tensor | None]]] | None = None, optimizer: Callable = fit_gpytorch_mll_scipy, - closure_kwargs: Optional[dict[str, Any]] = None, - optimizer_kwargs: Optional[dict[str, Any]] = None, + closure_kwargs: dict[str, Any] | None = None, + optimizer_kwargs: dict[str, Any] | None = None, max_attempts: int = 5, pick_best_of_all_attempts: bool = False, warning_handler: Callable[[WarningMessage], bool] = DEFAULT_WARNING_HANDLER, @@ -289,9 +289,9 @@ def _fit_fallback_approximate( _: type[Likelihood], __: type[ApproximateGPyTorchModel], *, - closure: Optional[Callable[[], tuple[Tensor, Sequence[Optional[Tensor]]]]] = None, - data_loader: Optional[DataLoader] = None, - optimizer: Optional[Callable] = None, + closure: Callable[[], tuple[Tensor, Sequence[Tensor | None]]] | None = None, + data_loader: DataLoader | None = None, + optimizer: Callable | None = None, full_batch_limit: int = 1024, **kwargs: Any, ) -> _ApproximateMarginalLogLikelihood: @@ -333,7 +333,7 @@ def _fit_fallback_approximate( def fit_fully_bayesian_model_nuts( - model: Union[SaasFullyBayesianSingleTaskGP, SaasFullyBayesianMultiTaskGP], + model: SaasFullyBayesianSingleTaskGP | SaasFullyBayesianMultiTaskGP, max_tree_depth: int = 6, warmup_steps: int = 512, num_samples: int = 256, diff --git a/botorch/generation/gen.py b/botorch/generation/gen.py index eb5ea1ba68..b15039aec6 100644 --- a/botorch/generation/gen.py +++ b/botorch/generation/gen.py @@ -12,8 +12,9 @@ import time import warnings +from collections.abc import Callable from functools import partial -from typing import Any, Callable, NoReturn, Optional, Union +from typing import Any, NoReturn import numpy as np import torch @@ -45,14 +46,14 @@ def gen_candidates_scipy( initial_conditions: Tensor, acquisition_function: AcquisitionFunction, - lower_bounds: Optional[Union[float, Tensor]] = None, - upper_bounds: Optional[Union[float, Tensor]] = None, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - nonlinear_inequality_constraints: Optional[list[tuple[Callable, bool]]] = None, - options: Optional[dict[str, Any]] = None, - fixed_features: Optional[dict[int, Optional[float]]] = None, - timeout_sec: Optional[float] = None, + lower_bounds: float | Tensor | None = None, + upper_bounds: float | Tensor | None = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + equality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + nonlinear_inequality_constraints: list[tuple[Callable, bool]] | None = None, + options: dict[str, Any] | None = None, + fixed_features: dict[int, float | None] | None = None, + timeout_sec: float | None = None, ) -> tuple[Tensor, Tensor]: r"""Generate a set of candidates using `scipy.optimize.minimize`. 
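With `lower_bounds: float | Tensor | None`, scalar bounds are now first-class in the annotation of `gen_candidates_scipy` above. A usage sketch, with `acqf` and `X_init` assumed defined:

import torch
from botorch.generation.gen import gen_candidates_scipy

d = X_init.shape[-1]  # input dimension

batch_candidates, batch_acq_values = gen_candidates_scipy(
    initial_conditions=X_init,   # `b x q x d` starting points
    acquisition_function=acqf,
    lower_bounds=0.0,            # a float broadcasts across all d dimensions ...
    upper_bounds=torch.ones(d),  # ... while a Tensor gives per-dimension bounds
)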
@@ -300,13 +301,13 @@ def f(x): def gen_candidates_torch( initial_conditions: Tensor, acquisition_function: AcquisitionFunction, - lower_bounds: Optional[Union[float, Tensor]] = None, - upper_bounds: Optional[Union[float, Tensor]] = None, + lower_bounds: float | Tensor | None = None, + upper_bounds: float | Tensor | None = None, optimizer: type[Optimizer] = torch.optim.Adam, - options: Optional[dict[str, Union[float, str]]] = None, - callback: Optional[Callable[[int, Tensor, Tensor], NoReturn]] = None, - fixed_features: Optional[dict[int, Optional[float]]] = None, - timeout_sec: Optional[float] = None, + options: dict[str, float | str] | None = None, + callback: Callable[[int, Tensor, Tensor], NoReturn] | None = None, + fixed_features: dict[int, float | None] | None = None, + timeout_sec: float | None = None, ) -> tuple[Tensor, Tensor]: r"""Generate a set of candidates using a `torch.optim` optimizer. diff --git a/botorch/generation/sampling.py b/botorch/generation/sampling.py index 9347eebf30..9dc9c2d297 100644 --- a/botorch/generation/sampling.py +++ b/botorch/generation/sampling.py @@ -17,7 +17,6 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Optional, Union import torch from botorch.acquisition.acquisition import AcquisitionFunction @@ -69,8 +68,8 @@ class MaxPosteriorSampling(SamplingStrategy): def __init__( self, model: Model, - objective: Optional[MCAcquisitionObjective] = None, - posterior_transform: Optional[PosteriorTransform] = None, + objective: MCAcquisitionObjective | None = None, + posterior_transform: PosteriorTransform | None = None, replacement: bool = True, ) -> None: r"""Constructor for the SamplingStrategy base class. @@ -243,9 +242,9 @@ class ConstrainedMaxPosteriorSampling(MaxPosteriorSampling): def __init__( self, model: Model, - constraint_model: Union[ModelListGP, MultiTaskGP], - objective: Optional[MCAcquisitionObjective] = None, - posterior_transform: Optional[PosteriorTransform] = None, + constraint_model: ModelListGP | MultiTaskGP, + objective: MCAcquisitionObjective | None = None, + posterior_transform: PosteriorTransform | None = None, replacement: bool = True, ) -> None: r"""Constructor for the SamplingStrategy base class. diff --git a/botorch/generation/utils.py b/botorch/generation/utils.py index b4cb0ad5ad..da349e2612 100644 --- a/botorch/generation/utils.py +++ b/botorch/generation/utils.py @@ -7,8 +7,8 @@ from __future__ import annotations import warnings +from collections.abc import Callable from dataclasses import dataclass -from typing import Callable, Optional, Union import torch @@ -21,7 +21,7 @@ def _convert_nonlinear_inequality_constraints( - nonlinear_inequality_constraints: list[Union[Callable, tuple[Callable, bool]]] + nonlinear_inequality_constraints: list[Callable | tuple[Callable, bool]] ) -> list[tuple[Callable, bool]]: """Convert legacy definitions of nonlinear inequality constraints into the new format. Assumes intra-point constraints.
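The conversion above amounts to tagging each bare legacy callable with an intra-point flag; a small sketch (the constraint function itself is made up):

import torch

def c(x: torch.Tensor) -> torch.Tensor:
    return 1.0 - x.sum(dim=-1)  # feasible iff c(x) >= 0

legacy_format = [c]        # bare callables, implicitly intra-point
new_format = [(c, True)]   # (callable, is_intrapoint) tuples, as the helper emits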
@@ -92,22 +92,22 @@ class _NoFixedFeatures: acquisition_function: FixedFeatureAcquisitionFunction initial_conditions: Tensor - lower_bounds: Optional[Union[float, Tensor]] - upper_bounds: Optional[Union[float, Tensor]] - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] - nonlinear_inequality_constraints: Optional[list[Callable[[Tensor], Tensor]]] + lower_bounds: float | Tensor | None + upper_bounds: float | Tensor | None + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None + equality_constraints: list[tuple[Tensor, Tensor, float]] | None + nonlinear_inequality_constraints: list[Callable[[Tensor], Tensor]] | None def _remove_fixed_features_from_optimization( - fixed_features: dict[int, Optional[float]], + fixed_features: dict[int, float | None], acquisition_function: AcquisitionFunction, initial_conditions: Tensor, - lower_bounds: Optional[Union[float, Tensor]], - upper_bounds: Optional[Union[float, Tensor]], - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]], - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]], - nonlinear_inequality_constraints: Optional[list[Callable[[Tensor], Tensor]]], + lower_bounds: float | Tensor | None, + upper_bounds: float | Tensor | None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None, + equality_constraints: list[tuple[Tensor, Tensor, float]] | None, + nonlinear_inequality_constraints: list[Callable[[Tensor], Tensor]] | None, ) -> _NoFixedFeatures: """ Given a set of non-empty fixed features, this function effectively reduces the diff --git a/botorch/models/approximate_gp.py b/botorch/models/approximate_gp.py index 1cf32d5ccf..c934196d1b 100644 --- a/botorch/models/approximate_gp.py +++ b/botorch/models/approximate_gp.py @@ -31,7 +31,6 @@ import copy import warnings -from typing import Optional, Union import torch from botorch.acquisition.objective import PosteriorTransform @@ -92,8 +91,8 @@ class ApproximateGPyTorchModel(GPyTorchModel): def __init__( self, - model: Optional[ApproximateGP] = None, - likelihood: Optional[Likelihood] = None, + model: ApproximateGP | None = None, + likelihood: Likelihood | None = None, num_outputs: int = 1, *args, **kwargs, @@ -148,9 +147,9 @@ def train(self, mode: bool = True) -> Self: def posterior( self, X, - output_indices: Optional[list[int]] = None, + output_indices: list[int] | None = None, observation_noise: bool = False, - posterior_transform: Optional[PosteriorTransform] = None, + posterior_transform: PosteriorTransform | None = None, ) -> GPyTorchPosterior: if output_indices is not None: raise NotImplementedError( # pragma: no cover @@ -196,15 +195,15 @@ class _SingleTaskVariationalGP(ApproximateGP): def __init__( self, train_X: Tensor, - train_Y: Optional[Tensor] = None, + train_Y: Tensor | None = None, num_outputs: int = 1, learn_inducing_points=True, - covar_module: Optional[Kernel] = None, - mean_module: Optional[Mean] = None, - variational_distribution: Optional[_VariationalDistribution] = None, + covar_module: Kernel | None = None, + mean_module: Mean | None = None, + variational_distribution: _VariationalDistribution | None = None, variational_strategy: type[_VariationalStrategy] = VariationalStrategy, - inducing_points: Optional[Union[Tensor, int]] = None, - inducing_point_allocator: Optional[InducingPointAllocator] = None, + inducing_points: Tensor | int | None = None, + inducing_point_allocator: InducingPointAllocator | None = None, ) -> None: r""" 
Args: @@ -335,16 +334,16 @@ class SingleTaskVariationalGP(ApproximateGPyTorchModel): def __init__( self, train_X: Tensor, - train_Y: Optional[Tensor] = None, - likelihood: Optional[Likelihood] = None, + train_Y: Tensor | None = None, + likelihood: Likelihood | None = None, num_outputs: int = 1, learn_inducing_points: bool = True, - covar_module: Optional[Kernel] = None, - mean_module: Optional[Mean] = None, - variational_distribution: Optional[_VariationalDistribution] = None, + covar_module: Kernel | None = None, + mean_module: Mean | None = None, + variational_distribution: _VariationalDistribution | None = None, variational_strategy: type[_VariationalStrategy] = VariationalStrategy, - inducing_points: Optional[Union[Tensor, int]] = None, - inducing_point_allocator: Optional[InducingPointAllocator] = None, + inducing_points: Tensor | int | None = None, + inducing_point_allocator: InducingPointAllocator | None = None, outcome_transform: OutcomeTransform | None = None, input_transform: InputTransform | None = None, ) -> None: diff --git a/botorch/models/contextual.py b/botorch/models/contextual.py index 34d8eb51b7..4dd5d6aed1 100644 --- a/botorch/models/contextual.py +++ b/botorch/models/contextual.py @@ -4,7 +4,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Any, Optional +from typing import Any from botorch.models.gp_regression import SingleTaskGP from botorch.models.kernels.contextual_lcea import LCEAKernel @@ -20,7 +20,7 @@ def __init__( self, train_X: Tensor, train_Y: Tensor, - train_Yvar: Optional[Tensor], + train_Yvar: Tensor | None, decomposition: dict[str, list[int]], ) -> None: r""" @@ -73,13 +73,13 @@ def __init__( self, train_X: Tensor, train_Y: Tensor, - train_Yvar: Optional[Tensor], + train_Yvar: Tensor | None, decomposition: dict[str, list[int]], train_embedding: bool = True, - cat_feature_dict: Optional[dict] = None, - embs_feature_dict: Optional[dict] = None, - embs_dim_list: Optional[list[int]] = None, - context_weight_dict: Optional[dict] = None, + cat_feature_dict: dict | None = None, + embs_feature_dict: dict | None = None, + embs_dim_list: list[int] | None = None, + context_weight_dict: dict | None = None, ) -> None: r""" Args: @@ -127,10 +127,10 @@ def construct_inputs( training_data: SupervisedDataset, decomposition: dict[str, list[str]], train_embedding: bool = True, - cat_feature_dict: Optional[dict] = None, - embs_feature_dict: Optional[dict] = None, - embs_dim_list: Optional[list[int]] = None, - context_weight_dict: Optional[dict] = None, + cat_feature_dict: dict | None = None, + embs_feature_dict: dict | None = None, + embs_dim_list: list[int] | None = None, + context_weight_dict: dict | None = None, ) -> dict[str, Any]: r"""Construct `Model` keyword arguments from a dict of `SupervisedDataset`. diff --git a/botorch/models/contextual_multioutput.py b/botorch/models/contextual_multioutput.py index e303315d7b..53315acb4a 100644 --- a/botorch/models/contextual_multioutput.py +++ b/botorch/models/contextual_multioutput.py @@ -13,7 +13,7 @@ Advances in Neural Information Processing Systems 33, NeurIPS 2020. 
""" -from typing import Any, Optional, Union +from typing import Any import torch from botorch.models.multitask import MultiTaskGP @@ -42,17 +42,17 @@ def __init__( train_X: Tensor, train_Y: Tensor, task_feature: int, - train_Yvar: Optional[Tensor] = None, - mean_module: Optional[Module] = None, - covar_module: Optional[Module] = None, - likelihood: Optional[Likelihood] = None, - context_cat_feature: Optional[Tensor] = None, - context_emb_feature: Optional[Tensor] = None, - embs_dim_list: Optional[list[int]] = None, - output_tasks: Optional[list[int]] = None, - all_tasks: Optional[list[int]] = None, + train_Yvar: Tensor | None = None, + mean_module: Module | None = None, + covar_module: Module | None = None, + likelihood: Likelihood | None = None, + context_cat_feature: Tensor | None = None, + context_emb_feature: Tensor | None = None, + embs_dim_list: list[int] | None = None, + output_tasks: list[int] | None = None, + all_tasks: list[int] | None = None, outcome_transform: OutcomeTransform | _DefaultType | None = DEFAULT, - input_transform: Optional[InputTransform] = None, + input_transform: InputTransform | None = None, ) -> None: r""" Args: @@ -211,12 +211,12 @@ def task_covar_module(self, task_idcs: Tensor) -> Tensor: @classmethod def construct_inputs( cls, - training_data: Union[SupervisedDataset, MultiTaskDataset], + training_data: SupervisedDataset | MultiTaskDataset, task_feature: int, - output_tasks: Optional[list[int]] = None, - context_cat_feature: Optional[Tensor] = None, - context_emb_feature: Optional[Tensor] = None, - embs_dim_list: Optional[list[int]] = None, + output_tasks: list[int] | None = None, + context_cat_feature: Tensor | None = None, + context_emb_feature: Tensor | None = None, + embs_dim_list: list[int] | None = None, **kwargs, ) -> dict[str, Any]: r"""Construct `Model` keyword arguments from a dataset and other args. diff --git a/botorch/models/converter.py b/botorch/models/converter.py index 9d898289f0..276a40623f 100644 --- a/botorch/models/converter.py +++ b/botorch/models/converter.py @@ -12,7 +12,6 @@ import warnings from copy import deepcopy -from typing import Optional import torch from botorch.exceptions import UnsupportedError @@ -440,8 +439,8 @@ def batched_multi_output_to_single_output( def _get_adjusted_batch_keys( batch_state_dict: dict[str, Tensor], - input_transform: Optional[InputTransform], - outcome_transform: Optional[OutcomeTransform] = None, + input_transform: InputTransform | None, + outcome_transform: OutcomeTransform | None = None, ) -> tuple[set[str], set[str]]: r"""Group the keys based on whether the value requires batch shape changes. 
diff --git a/botorch/models/cost.py b/botorch/models/cost.py index 17b1998a61..5868c7e683 100644 --- a/botorch/models/cost.py +++ b/botorch/models/cost.py @@ -15,7 +15,6 @@ from __future__ import annotations -from typing import Optional import torch from botorch.models.deterministic import DeterministicModel @@ -45,7 +44,7 @@ class AffineFidelityCostModel(DeterministicModel): def __init__( self, - fidelity_weights: Optional[dict[int, float]] = None, + fidelity_weights: dict[int, float] | None = None, fixed_cost: float = 0.01, ) -> None: r""" diff --git a/botorch/models/deterministic.py b/botorch/models/deterministic.py index 8c7b9a2436..31de8d29df 100644 --- a/botorch/models/deterministic.py +++ b/botorch/models/deterministic.py @@ -27,7 +27,7 @@ from __future__ import annotations from abc import abstractmethod -from typing import Callable, Optional, Union +from collections.abc import Callable import torch from botorch.models.ensemble import EnsembleModel @@ -106,7 +106,7 @@ def forward(self, X: Tensor) -> Tensor: class AffineDeterministicModel(DeterministicModel): r"""An affine deterministic model.""" - def __init__(self, a: Tensor, b: Union[Tensor, float] = 0.01) -> None: + def __init__(self, a: Tensor, b: Tensor | float = 0.01) -> None: r"""Affine deterministic model from weights and offset terms. A simple model of the form @@ -177,11 +177,11 @@ class FixedSingleSampleModel(DeterministicModel): def __init__( self, model: Model, - w: Optional[Tensor] = None, - dim: Optional[int] = None, - jitter: Optional[float] = 1e-8, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.dtype] = None, + w: Tensor | None = None, + dim: int | None = None, + jitter: float | None = 1e-8, + dtype: torch.dtype | None = None, + device: torch.device | None = None, ) -> None: r""" Args: diff --git a/botorch/models/ensemble.py b/botorch/models/ensemble.py index abf896f693..9ac9b945a5 100644 --- a/botorch/models/ensemble.py +++ b/botorch/models/ensemble.py @@ -12,7 +12,7 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Any, Optional +from typing import Any from botorch.acquisition.objective import PosteriorTransform from botorch.exceptions.errors import UnsupportedError @@ -48,8 +48,8 @@ def num_outputs(self) -> int: def posterior( self, X: Tensor, - output_indices: Optional[list[int]] = None, - posterior_transform: Optional[PosteriorTransform] = None, + output_indices: list[int] | None = None, + posterior_transform: PosteriorTransform | None = None, **kwargs: Any, ) -> EnsemblePosterior: r"""Compute the ensemble posterior at X. diff --git a/botorch/models/fully_bayesian.py b/botorch/models/fully_bayesian.py index a2d98f196c..0b2ddffdf9 100644 --- a/botorch/models/fully_bayesian.py +++ b/botorch/models/fully_bayesian.py @@ -34,7 +34,7 @@ import math from abc import abstractmethod from collections.abc import Mapping -from typing import Any, Optional +from typing import Any import pyro import torch @@ -112,7 +112,7 @@ class PyroModel: """ def set_inputs( - self, train_X: Tensor, train_Y: Tensor, train_Yvar: Optional[Tensor] = None + self, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor | None = None ) -> None: """Set the training data.
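A brief usage sketch for `AffineFidelityCostModel`; the weights, fixed cost, and shapes are illustrative:

import torch
from botorch.models.cost import AffineFidelityCostModel

# cost(x) = 5.0 + 1.0 * x[..., 2], i.e. column 2 holds the fidelity parameter
cost_model = AffineFidelityCostModel(fidelity_weights={2: 1.0}, fixed_cost=5.0)
X = torch.rand(4, 1, 3)
costs = cost_model(X)  # deterministic forward pass; shape `4 x 1 x 1`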
@@ -160,7 +160,7 @@ class SaasPyroModel(PyroModel): """ def set_inputs( - self, train_X: Tensor, train_Y: Tensor, train_Yvar: Optional[Tensor] = None + self, train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor | None = None ) -> None: super().set_inputs(train_X, train_Y, train_Yvar) self.ard_num_dims = self.train_X.shape[-1] @@ -337,10 +337,10 @@ def __init__( self, train_X: Tensor, train_Y: Tensor, - train_Yvar: Optional[Tensor] = None, - outcome_transform: Optional[OutcomeTransform] = None, - input_transform: Optional[InputTransform] = None, - pyro_model: Optional[PyroModel] = None, + train_Yvar: Tensor | None = None, + outcome_transform: OutcomeTransform | None = None, + input_transform: InputTransform | None = None, + pyro_model: PyroModel | None = None, ) -> None: r"""Initialize the fully Bayesian single-task GP model. @@ -509,9 +509,9 @@ def forward(self, X: Tensor) -> MultivariateNormal: def posterior( self, X: Tensor, - output_indices: Optional[list[int]] = None, + output_indices: list[int] | None = None, observation_noise: bool = False, - posterior_transform: Optional[PosteriorTransform] = None, + posterior_transform: PosteriorTransform | None = None, **kwargs: Any, ) -> GaussianMixturePosterior: r"""Computes the posterior over model outputs at the provided points. diff --git a/botorch/models/fully_bayesian_multitask.py b/botorch/models/fully_bayesian_multitask.py index 44a74f5e89..5e01b18d46 100644 --- a/botorch/models/fully_bayesian_multitask.py +++ b/botorch/models/fully_bayesian_multitask.py @@ -9,7 +9,7 @@ from collections.abc import Mapping -from typing import Any, NoReturn, Optional +from typing import Any, NoReturn import pyro import torch @@ -46,9 +46,9 @@ def set_inputs( self, train_X: Tensor, train_Y: Tensor, - train_Yvar: Optional[Tensor], + train_Yvar: Tensor | None, task_feature: int, - task_rank: Optional[int] = None, + task_rank: int | None = None, ) -> None: """Set the training data. @@ -196,13 +196,13 @@ def __init__( train_X: Tensor, train_Y: Tensor, task_feature: int, - train_Yvar: Optional[Tensor] = None, - output_tasks: Optional[list[int]] = None, - rank: Optional[int] = None, - all_tasks: Optional[list[int]] = None, - outcome_transform: Optional[OutcomeTransform] = None, - input_transform: Optional[InputTransform] = None, - pyro_model: Optional[MultitaskSaasPyroModel] = None, + train_Yvar: Tensor | None = None, + output_tasks: list[int] | None = None, + rank: int | None = None, + all_tasks: list[int] | None = None, + outcome_transform: OutcomeTransform | None = None, + input_transform: InputTransform | None = None, + pyro_model: MultitaskSaasPyroModel | None = None, ) -> None: r"""Initialize the fully Bayesian multi-task GP model. @@ -348,9 +348,9 @@ def load_mcmc_samples(self, mcmc_samples: dict[str, Tensor]) -> None: def posterior( self, X: Tensor, - output_indices: Optional[list[int]] = None, + output_indices: list[int] | None = None, observation_noise: bool = False, - posterior_transform: Optional[PosteriorTransform] = None, + posterior_transform: PosteriorTransform | None = None, **kwargs: Any, ) -> GaussianMixturePosterior: r"""Computes the posterior over model outputs at the provided points. 
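A sketch of the fully Bayesian workflow these hunks touch, reusing `train_X` and `train_Y` from the fit sketch above; the sampler settings simply restate the defaults shown in the signature:

import torch
from botorch.fit import fit_fully_bayesian_model_nuts
from botorch.models.fully_bayesian import SaasFullyBayesianSingleTaskGP

saas = SaasFullyBayesianSingleTaskGP(train_X, train_Y)
fit_fully_bayesian_model_nuts(saas, max_tree_depth=6, warmup_steps=512, num_samples=256)
posterior = saas.posterior(torch.rand(3, 2, dtype=torch.float64))  # GaussianMixturePosterior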
diff --git a/botorch/models/gp_regression.py b/botorch/models/gp_regression.py index dfb7a637b6..ee380f9e84 100644 --- a/botorch/models/gp_regression.py +++ b/botorch/models/gp_regression.py @@ -31,7 +31,7 @@ from __future__ import annotations import warnings -from typing import NoReturn, Optional, Union +from typing import NoReturn import torch from botorch.models.gpytorch import BatchedMultiOutputGPyTorchModel @@ -131,12 +131,12 @@ def __init__( self, train_X: Tensor, train_Y: Tensor, - train_Yvar: Optional[Tensor] = None, - likelihood: Optional[Likelihood] = None, - covar_module: Optional[Module] = None, - mean_module: Optional[Mean] = None, - outcome_transform: Optional[Union[OutcomeTransform, _DefaultType]] = DEFAULT, - input_transform: Optional[InputTransform] = None, + train_Yvar: Tensor | None = None, + likelihood: Likelihood | None = None, + covar_module: Module | None = None, + mean_module: Mean | None = None, + outcome_transform: OutcomeTransform | _DefaultType | None = DEFAULT, + input_transform: InputTransform | None = None, ) -> None: r""" Args: @@ -224,8 +224,8 @@ def __init__( @classmethod def construct_inputs( - cls, training_data: SupervisedDataset, *, task_feature: Optional[int] = None - ) -> dict[str, Union[BotorchContainer, Tensor]]: + cls, training_data: SupervisedDataset, *, task_feature: int | None = None + ) -> dict[str, BotorchContainer | Tensor]: r"""Construct `SingleTaskGP` keyword arguments from a `SupervisedDataset`. Args: @@ -280,8 +280,8 @@ def __init__( train_X: Tensor, train_Y: Tensor, train_Yvar: Tensor, - outcome_transform: Optional[OutcomeTransform] = None, - input_transform: Optional[InputTransform] = None, + outcome_transform: OutcomeTransform | None = None, + input_transform: InputTransform | None = None, ) -> None: r""" Args: diff --git a/botorch/models/gp_regression_fidelity.py b/botorch/models/gp_regression_fidelity.py index 29532b86ff..add63558c7 100644 --- a/botorch/models/gp_regression_fidelity.py +++ b/botorch/models/gp_regression_fidelity.py @@ -25,7 +25,9 @@ from __future__ import annotations -from typing import Any, Sequence +from collections.abc import Sequence + +from typing import Any import torch from botorch.exceptions.errors import UnsupportedError diff --git a/botorch/models/gp_regression_mixed.py b/botorch/models/gp_regression_mixed.py index 30941d27e3..93372a60e1 100644 --- a/botorch/models/gp_regression_mixed.py +++ b/botorch/models/gp_regression_mixed.py @@ -6,7 +6,9 @@ from __future__ import annotations -from typing import Any, Callable, Optional, Union +from collections.abc import Callable + +from typing import Any import torch from botorch.models.gp_regression import SingleTaskGP @@ -61,13 +63,13 @@ def __init__( train_X: Tensor, train_Y: Tensor, cat_dims: list[int], - train_Yvar: Optional[Tensor] = None, - cont_kernel_factory: Optional[ + train_Yvar: Tensor | None = None, + cont_kernel_factory: None | ( Callable[[torch.Size, int, list[int]], Kernel] - ] = None, - likelihood: Optional[Likelihood] = None, - outcome_transform: Optional[Union[OutcomeTransform, _DefaultType]] = DEFAULT, - input_transform: Optional[InputTransform] = None, # TODO + ) = None, + likelihood: Likelihood | None = None, + outcome_transform: OutcomeTransform | _DefaultType | None = DEFAULT, + input_transform: InputTransform | None = None, # TODO ) -> None: r"""A single-task exact GP model supporting categorical parameters. 
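A construction sketch for `MixedSingleTaskGP`; the data layout (last column categorical) is illustrative:

import torch
from botorch.models.gp_regression_mixed import MixedSingleTaskGP

X_cont = torch.rand(20, 2, dtype=torch.float64)
X_cat = torch.randint(3, (20, 1)).double()  # categories 0/1/2
X_mixed = torch.cat([X_cont, X_cat], dim=-1)
Y_mixed = X_cont.sum(dim=-1, keepdim=True).sin()
mixed_gp = MixedSingleTaskGP(X_mixed, Y_mixed, cat_dims=[-1])  # train_Yvar defaults to None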
@@ -164,7 +166,7 @@ def construct_inputs( cls, training_data: SupervisedDataset, categorical_features: list[int], - likelihood: Optional[Likelihood] = None, + likelihood: Likelihood | None = None, ) -> dict[str, Any]: r"""Construct `Model` keyword arguments from a dict of `SupervisedDataset`. diff --git a/botorch/models/gpytorch.py b/botorch/models/gpytorch.py index 3bc2059c14..c44616e0d9 100644 --- a/botorch/models/gpytorch.py +++ b/botorch/models/gpytorch.py @@ -17,7 +17,7 @@ import warnings from abc import ABC from copy import deepcopy -from typing import Any, Optional, TYPE_CHECKING, Union +from typing import Any, TYPE_CHECKING import torch from botorch.acquisition.objective import PosteriorTransform @@ -66,7 +66,7 @@ class (e.g. an `ExactGP`) and this `GPyTorchModel`. See e.g. `SingleTaskGP`. @staticmethod def _validate_tensor_args( - X: Tensor, Y: Tensor, Yvar: Optional[Tensor] = None, strict: bool = True + X: Tensor, Y: Tensor, Yvar: Tensor | None = None, strict: bool = True ) -> None: r"""Checks that `Y` and `Yvar` have an explicit output dimension if strict. Checks that the dtypes of the inputs match, and warns if using float. @@ -156,10 +156,10 @@ def num_outputs(self) -> int: def posterior( self, X: Tensor, - observation_noise: Union[bool, Tensor] = False, - posterior_transform: Optional[PosteriorTransform] = None, + observation_noise: bool | Tensor = False, + posterior_transform: PosteriorTransform | None = None, **kwargs: Any, - ) -> Union[GPyTorchPosterior, TransformedPosterior]: + ) -> GPyTorchPosterior | TransformedPosterior: r"""Computes the posterior over model outputs at the provided points. Args: @@ -204,7 +204,7 @@ def posterior( return posterior def condition_on_observations( - self, X: Tensor, Y: Tensor, noise: Optional[Tensor] = None, **kwargs: Any + self, X: Tensor, Y: Tensor, noise: Tensor | None = None, **kwargs: Any ) -> Model: r"""Condition the model on new observations. @@ -325,8 +325,8 @@ def batch_shape(self) -> torch.Size: return self._input_batch_shape def _transform_tensor_args( - self, X: Tensor, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Tensor, Optional[Tensor]]: + self, X: Tensor, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor, Tensor | None]: r"""Transforms tensor arguments: for single output models, the output dimension is squeezed and for multi-output models, the output dimension is transformed into the left-most batch dimension. @@ -358,7 +358,7 @@ def _apply_noise( self, X: Tensor, mvn: MultivariateNormal, - observation_noise: Union[bool, Tensor] = False, + observation_noise: bool | Tensor = False, ) -> MultivariateNormal: """Adds the observation noise to the posterior. @@ -405,10 +405,10 @@ def _apply_noise( def posterior( self, X: Tensor, - output_indices: Optional[list[int]] = None, - observation_noise: Union[bool, Tensor] = False, - posterior_transform: Optional[PosteriorTransform] = None, - ) -> Union[GPyTorchPosterior, TransformedPosterior]: + output_indices: list[int] | None = None, + observation_noise: bool | Tensor = False, + posterior_transform: PosteriorTransform | None = None, + ) -> GPyTorchPosterior | TransformedPosterior: r"""Computes the posterior over model outputs at the provided points. 
Args: @@ -631,10 +631,10 @@ def batch_shape(self) -> torch.Size: def posterior( self, X: Tensor, - output_indices: Optional[list[int]] = None, - observation_noise: Union[bool, Tensor] = False, - posterior_transform: Optional[PosteriorTransform] = None, - ) -> Union[GPyTorchPosterior, PosteriorList]: + output_indices: list[int] | None = None, + observation_noise: bool | Tensor = False, + posterior_transform: PosteriorTransform | None = None, + ) -> GPyTorchPosterior | PosteriorList: r"""Computes the posterior over model outputs at the provided points. If any model returns a MultitaskMultivariateNormal posterior, then that will be split into individual MVNs per task, with inter-task covariance @@ -776,7 +776,7 @@ def _apply_noise( X: Tensor, mvn: MultivariateNormal, num_outputs: int, - observation_noise: Union[bool, Tensor], + observation_noise: bool | Tensor, ) -> MultivariateNormal: """Adds the observation noise to the posterior. @@ -836,10 +836,10 @@ def _apply_noise( def posterior( self, X: Tensor, - output_indices: Optional[list[int]] = None, - observation_noise: Union[bool, Tensor] = False, - posterior_transform: Optional[PosteriorTransform] = None, - ) -> Union[GPyTorchPosterior, TransformedPosterior]: + output_indices: list[int] | None = None, + observation_noise: bool | Tensor = False, + posterior_transform: PosteriorTransform | None = None, + ) -> GPyTorchPosterior | TransformedPosterior: r"""Computes the posterior over model outputs at the provided points. Args: diff --git a/botorch/models/higher_order_gp.py b/botorch/models/higher_order_gp.py index b3f1b93d91..27bb54134d 100644 --- a/botorch/models/higher_order_gp.py +++ b/botorch/models/higher_order_gp.py @@ -16,7 +16,7 @@ import warnings from contextlib import ExitStack -from typing import Any, Optional, Union +from typing import Any import torch from botorch.acquisition.objective import PosteriorTransform @@ -64,7 +64,7 @@ class FlattenedStandardize(Standardize): def __init__( self, output_shape: torch.Size, - batch_shape: Optional[torch.Size] = None, + batch_shape: torch.Size | None = None, min_stdv: float = 1e-8, ): r""" @@ -77,9 +77,7 @@ def __init__( if batch_shape is None: batch_shape = torch.Size() - super(FlattenedStandardize, self).__init__( - m=1, outputs=None, batch_shape=batch_shape, min_stdv=min_stdv - ) + super().__init__(m=1, outputs=None, batch_shape=batch_shape, min_stdv=min_stdv) self.output_shape = output_shape self.batch_shape = batch_shape @@ -93,8 +91,8 @@ def _return_to_output_shape(self, tsr: Tensor) -> Tensor: return out def forward( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: Y = self._squeeze_to_single_output(Y) if Yvar is not None: Yvar = self._squeeze_to_single_output(Yvar) @@ -109,8 +107,8 @@ def forward( return Y_out, Yvar_out def untransform( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: Y = self._squeeze_to_single_output(Y) if Yvar is not None: Yvar = self._squeeze_to_single_output(Yvar) @@ -180,13 +178,13 @@ def __init__( self, train_X: Tensor, train_Y: Tensor, - likelihood: Optional[Likelihood] = None, - covar_modules: Optional[list[Kernel]] = None, - num_latent_dims: Optional[list[int]] = None, + likelihood: Likelihood | None = None, + covar_modules: list[Kernel] | None = None, + num_latent_dims: list[int] | None = None, learn_latent_pars: 
bool = True, latent_init: str = "default", - outcome_transform: Union[OutcomeTransform, _DefaultType, None] = DEFAULT, - input_transform: Optional[InputTransform] = None, + outcome_transform: OutcomeTransform | _DefaultType | None = DEFAULT, + input_transform: InputTransform | None = None, ): r""" Args: @@ -390,7 +388,7 @@ def get_fantasy_model(self, inputs, targets, **kwargs): return super().get_fantasy_model(inputs, reshaped_targets, **kwargs) def condition_on_observations( - self, X: Tensor, Y: Tensor, noise: Optional[torch.Tensor] = None, **kwargs: Any + self, X: Tensor, Y: Tensor, noise: torch.Tensor | None = None, **kwargs: Any ) -> HigherOrderGP: r"""Condition the model on new observations. @@ -439,9 +437,9 @@ def condition_on_observations( def posterior( self, X: Tensor, - output_indices: Optional[list[int]] = None, - observation_noise: Union[bool, Tensor] = False, - posterior_transform: Optional[PosteriorTransform] = None, + output_indices: list[int] | None = None, + observation_noise: bool | Tensor = False, + posterior_transform: PosteriorTransform | None = None, ) -> GPyTorchPosterior: self.eval() # make sure we're calling a posterior diff --git a/botorch/models/kernels/contextual_lcea.py b/botorch/models/kernels/contextual_lcea.py index df30cc6a0d..8dafbaca59 100644 --- a/botorch/models/kernels/contextual_lcea.py +++ b/botorch/models/kernels/contextual_lcea.py @@ -4,7 +4,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Any, Optional +from typing import Any import torch from botorch.models.utils.gpytorch_modules import get_covar_module_with_dim_scaled_prior @@ -41,7 +41,7 @@ def is_contiguous(indices: list[int]) -> bool: return set(indices) == set(range(min_idx, min_idx + len(indices))) -def get_permutation(decomposition: dict[str, list[int]]) -> Optional[list[int]]: +def get_permutation(decomposition: dict[str, list[int]]) -> list[int] | None: """Construct permutation to reorder the parameters such that: 1) the parameters for each context are contiguous. @@ -98,11 +98,11 @@ def __init__( decomposition: dict[str, list[int]], batch_shape: torch.Size, train_embedding: bool = True, - cat_feature_dict: Optional[dict] = None, - embs_feature_dict: Optional[dict] = None, - embs_dim_list: Optional[list[int]] = None, - context_weight_dict: Optional[dict] = None, - device: Optional[torch.device] = None, + cat_feature_dict: dict | None = None, + embs_feature_dict: dict | None = None, + embs_dim_list: list[int] | None = None, + context_weight_dict: dict | None = None, + device: torch.device | None = None, ) -> None: r""" Args: @@ -190,7 +190,7 @@ def __init__( self.register_constraint("raw_outputscale_list", Positive()) @property - def device(self) -> Optional[torch.device]: + def device(self) -> torch.device | None: return self._device @property @@ -212,9 +212,9 @@ def _set_outputscale_list(self, value: Tensor) -> None: def _set_context_features( self, - cat_feature_dict: Optional[dict] = None, - embs_feature_dict: Optional[dict] = None, - embs_dim_list: Optional[list[int]] = None, + cat_feature_dict: dict | None = None, + embs_feature_dict: dict | None = None, + embs_dim_list: list[int] | None = None, ) -> None: """Set context categorical features and continuous embedding features. 
If cat_feature_dict is None, context indices will be used; If embs_dim_list diff --git a/botorch/models/kernels/contextual_sac.py b/botorch/models/kernels/contextual_sac.py index dbd09b344f..5e0a3aebfa 100644 --- a/botorch/models/kernels/contextual_sac.py +++ b/botorch/models/kernels/contextual_sac.py @@ -4,7 +4,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Any, Optional +from typing import Any import torch from botorch.models.utils.gpytorch_modules import get_covar_module_with_dim_scaled_prior @@ -44,7 +44,7 @@ def __init__( self, decomposition: dict[str, list[int]], batch_shape: torch.Size, - device: Optional[torch.device] = None, + device: torch.device | None = None, ) -> None: r""" Args: @@ -85,7 +85,7 @@ def __init__( self.kernel_dict = ModuleDict(self.kernel_dict) @property - def device(self) -> Optional[torch.device]: + def device(self) -> torch.device | None: return self._device def forward( diff --git a/botorch/models/kernels/downsampling.py b/botorch/models/kernels/downsampling.py index 17a7d77c28..768fe1e881 100644 --- a/botorch/models/kernels/downsampling.py +++ b/botorch/models/kernels/downsampling.py @@ -6,7 +6,6 @@ from __future__ import annotations -from typing import Optional import torch from gpytorch.constraints import Interval, Positive @@ -29,10 +28,10 @@ class DownsamplingKernel(Kernel): def __init__( self, - power_prior: Optional[Prior] = None, - offset_prior: Optional[Prior] = None, - power_constraint: Optional[Interval] = None, - offset_constraint: Optional[Interval] = None, + power_prior: Prior | None = None, + offset_prior: Prior | None = None, + power_constraint: Interval | None = None, + offset_constraint: Interval | None = None, **kwargs, ): r""" @@ -110,8 +109,8 @@ def forward( self, x1: Tensor, x2: Tensor, - diag: Optional[bool] = False, - last_dim_is_batch: Optional[bool] = False, + diag: bool | None = False, + last_dim_is_batch: bool | None = False, **params, ) -> Tensor: offset = self.offset diff --git a/botorch/models/kernels/exponential_decay.py b/botorch/models/kernels/exponential_decay.py index 172caf383e..cfe49e2028 100644 --- a/botorch/models/kernels/exponential_decay.py +++ b/botorch/models/kernels/exponential_decay.py @@ -6,7 +6,6 @@ from __future__ import annotations -from typing import Optional import torch from gpytorch.constraints import Interval, Positive @@ -31,10 +30,10 @@ class ExponentialDecayKernel(Kernel): def __init__( self, - power_prior: Optional[Prior] = None, - offset_prior: Optional[Prior] = None, - power_constraint: Optional[Interval] = None, - offset_constraint: Optional[Interval] = None, + power_prior: Prior | None = None, + offset_prior: Prior | None = None, + power_constraint: Interval | None = None, + offset_constraint: Interval | None = None, **kwargs, ): r""" diff --git a/botorch/models/kernels/infinite_width_bnn.py b/botorch/models/kernels/infinite_width_bnn.py index a3eefe2479..dc3c062836 100644 --- a/botorch/models/kernels/infinite_width_bnn.py +++ b/botorch/models/kernels/infinite_width_bnn.py @@ -6,7 +6,6 @@ from __future__ import annotations -from typing import Optional import torch from gpytorch.constraints import Positive @@ -36,10 +35,10 @@ class InfiniteWidthBNNKernel(Kernel): def __init__( self, depth: int = 3, - batch_shape: Optional[torch.Size] = None, - active_dims: Optional[tuple[int, ...]] = None, + batch_shape: torch.Size | None = None, + active_dims: tuple[int, ...] 
| None = None, acos_eps: float = 1e-7, - device: Optional[torch.device] = None, + device: torch.device | None = None, ) -> None: r""" Args: @@ -157,8 +156,8 @@ def forward( self, x1: Tensor, x2: Tensor, - diag: Optional[bool] = False, - last_dim_is_batch: Optional[bool] = False, + diag: bool | None = False, + last_dim_is_batch: bool | None = False, **params, ) -> Tensor: """ diff --git a/botorch/models/kernels/linear_truncated_fidelity.py b/botorch/models/kernels/linear_truncated_fidelity.py index c5187ebe80..51c2504f8a 100644 --- a/botorch/models/kernels/linear_truncated_fidelity.py +++ b/botorch/models/kernels/linear_truncated_fidelity.py @@ -6,7 +6,7 @@ from __future__ import annotations -from typing import Any, Optional +from typing import Any import torch from botorch.exceptions import UnsupportedError @@ -55,16 +55,16 @@ class LinearTruncatedFidelityKernel(Kernel): def __init__( # noqa C901 self, fidelity_dims: list[int], - dimension: Optional[int] = None, - power_prior: Optional[Prior] = None, - power_constraint: Optional[Interval] = None, + dimension: int | None = None, + power_prior: Prior | None = None, + power_constraint: Interval | None = None, nu: float = 2.5, - lengthscale_prior_unbiased: Optional[Prior] = None, - lengthscale_prior_biased: Optional[Prior] = None, - lengthscale_constraint_unbiased: Optional[Interval] = None, - lengthscale_constraint_biased: Optional[Interval] = None, - covar_module_unbiased: Optional[Kernel] = None, - covar_module_biased: Optional[Kernel] = None, + lengthscale_prior_unbiased: Prior | None = None, + lengthscale_prior_biased: Prior | None = None, + lengthscale_constraint_unbiased: Interval | None = None, + lengthscale_constraint_biased: Interval | None = None, + covar_module_unbiased: Kernel | None = None, + covar_module_biased: Kernel | None = None, **kwargs: Any, ) -> None: """ diff --git a/botorch/models/kernels/orthogonal_additive_kernel.py b/botorch/models/kernels/orthogonal_additive_kernel.py index 0a6356425f..8922bda942 100644 --- a/botorch/models/kernels/orthogonal_additive_kernel.py +++ b/botorch/models/kernels/orthogonal_additive_kernel.py @@ -4,7 +4,6 @@ # LICENSE file in the root directory of this source tree. import math -from typing import Optional import numpy import torch @@ -42,13 +41,13 @@ def __init__( dim: int, quad_deg: int = 32, second_order: bool = False, - batch_shape: Optional[torch.Size] = None, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + batch_shape: torch.Size | None = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, coeff_constraint: Interval = _positivity_constraint, - offset_prior: Optional[Prior] = None, - coeffs_1_prior: Optional[Prior] = None, - coeffs_2_prior: Optional[Prior] = None, + offset_prior: Prior | None = None, + coeffs_1_prior: Prior | None = None, + coeffs_2_prior: Prior | None = None, ): """ Args: @@ -160,7 +159,7 @@ def coeffs_1(self) -> Tensor: return self.coeff_constraint.transform(self.raw_coeffs_1) @property - def coeffs_2(self) -> Optional[Tensor]: + def coeffs_2(self) -> Tensor | None: """Returns the upper-triangular tensor of second-order coefficients. 
NOTE: We only keep track of the upper triangular part of raw second order @@ -304,8 +303,8 @@ def leggauss( deg: int, a: float = -1.0, b: float = 1.0, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, ) -> tuple[Tensor, Tensor]: """Computes Gauss-Legendre quadrature nodes and weights. Wraps `numpy.polynomial.legendre.leggauss` and returns Torch Tensors. diff --git a/botorch/models/model.py b/botorch/models/model.py index da82b65ba7..1c36aee1ad 100644 --- a/botorch/models/model.py +++ b/botorch/models/model.py @@ -15,8 +15,8 @@ import warnings from abc import ABC, abstractmethod from collections import defaultdict -from collections.abc import Mapping -from typing import Any, Callable, Optional, TYPE_CHECKING, Union +from collections.abc import Callable, Mapping +from typing import Any, TYPE_CHECKING import numpy as np import torch @@ -73,7 +73,7 @@ class Model(Module, ABC): """ # noqa: E501 _has_transformed_inputs: bool = False - _original_train_inputs: Optional[Tensor] = None + _original_train_inputs: Tensor | None = None _is_fully_bayesian = False _is_ensemble = False @@ -81,9 +81,9 @@ class Model(Module, ABC): def posterior( self, X: Tensor, - output_indices: Optional[list[int]] = None, - observation_noise: Union[bool, Tensor] = False, - posterior_transform: Optional[PosteriorTransform] = None, + output_indices: list[int] | None = None, + observation_noise: bool | Tensor = False, + posterior_transform: PosteriorTransform | None = None, ) -> Posterior: r"""Computes the posterior over model outputs at the provided points. @@ -174,7 +174,7 @@ def condition_on_observations(self, X: Tensor, Y: Tensor, **kwargs: Any) -> Mode def construct_inputs( cls, training_data: SupervisedDataset, - ) -> dict[str, Union[BotorchContainer, Tensor]]: + ) -> dict[str, BotorchContainer | Tensor]: """ Construct `Model` keyword arguments from a `SupervisedDataset`. @@ -199,7 +199,7 @@ def construct_inputs( def transform_inputs( self, X: Tensor, - input_transform: Optional[Module] = None, + input_transform: Module | None = None, ) -> Tensor: r"""Transform inputs. @@ -310,7 +310,7 @@ def posterior( def transform_inputs( self, X: Tensor, - input_transform: Optional[Module] = None, + input_transform: Module | None = None, ) -> Tensor: """ Classes that inherit from `FantasizeMixin` must implement @@ -321,7 +321,7 @@ def fantasize( self, X: Tensor, sampler: MCSampler, - observation_noise: Optional[Tensor] = None, + observation_noise: Tensor | None = None, **kwargs: Any, ) -> Self: r"""Construct a fantasy model. @@ -424,9 +424,7 @@ def __init__(self, *models: Model) -> None: super().__init__() self.models = ModuleList(models) - def _get_group_subset_indices( - self, idcs: Optional[list[int]] - ) -> dict[int, list[int]]: + def _get_group_subset_indices(self, idcs: list[int] | None) -> dict[int, list[int]]: r"""Convert global subset indices to indices for the individual models. Args: @@ -452,9 +450,9 @@ def _get_group_subset_indices( def posterior( self, X: Tensor, - output_indices: Optional[list[int]] = None, - observation_noise: Union[bool, Tensor] = False, - posterior_transform: Optional[Callable[[PosteriorList], Posterior]] = None, + output_indices: list[int] | None = None, + observation_noise: bool | Tensor = False, + posterior_transform: Callable[[PosteriorList], Posterior] | None = None, ) -> Posterior: r"""Computes the posterior over model outputs at the provided points. 
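The `leggauss` helper above is a thin Torch wrapper around NumPy's Gauss-Legendre rule; a usage sketch with the rescaling arguments from its signature:

import torch
from botorch.models.kernels.orthogonal_additive_kernel import leggauss

# 32 nodes and weights, rescaled from the default [-1, 1] to [0, 1]
nodes, weights = leggauss(deg=32, a=0.0, b=1.0, dtype=torch.float64)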
@@ -598,8 +596,8 @@ def fantasize( self, X: Tensor, sampler: MCSampler, - observation_noise: Optional[Tensor] = None, - evaluation_mask: Optional[Tensor] = None, + observation_noise: Tensor | None = None, + evaluation_mask: Tensor | None = None, **kwargs: Any, ) -> Model: r"""Construct a fantasy model. diff --git a/botorch/models/multitask.py b/botorch/models/multitask.py index 092639f250..9b533c856c 100644 --- a/botorch/models/multitask.py +++ b/botorch/models/multitask.py @@ -30,7 +30,7 @@ from __future__ import annotations import math -from typing import Any, Optional, Union +from typing import Any import torch from botorch.acquisition.objective import PosteriorTransform @@ -82,9 +82,7 @@ from torch import Tensor -def get_task_value_remapping( - task_values: Tensor, dtype: torch.dtype -) -> Optional[Tensor]: +def get_task_value_remapping(task_values: Tensor, dtype: torch.dtype) -> Tensor | None: """Construct a mapping of discrete task values to contiguous int-valued floats. Args: @@ -140,16 +138,16 @@ def __init__( train_X: Tensor, train_Y: Tensor, task_feature: int, - train_Yvar: Optional[Tensor] = None, - mean_module: Optional[Module] = None, - covar_module: Optional[Module] = None, - likelihood: Optional[Likelihood] = None, - task_covar_prior: Optional[Prior] = None, - output_tasks: Optional[list[int]] = None, - rank: Optional[int] = None, - all_tasks: Optional[list[int]] = None, - outcome_transform: Optional[Union[OutcomeTransform, _DefaultType]] = DEFAULT, - input_transform: Optional[InputTransform] = None, + train_Yvar: Tensor | None = None, + mean_module: Module | None = None, + covar_module: Module | None = None, + likelihood: Likelihood | None = None, + task_covar_prior: Prior | None = None, + output_tasks: list[int] | None = None, + rank: int | None = None, + all_tasks: list[int] | None = None, + outcome_transform: OutcomeTransform | _DefaultType | None = DEFAULT, + input_transform: InputTransform | None = None, ) -> None: r"""Multi-Task GP model using an ICM kernel. @@ -319,7 +317,7 @@ def get_all_tasks( cls, train_X: Tensor, task_feature: int, - output_tasks: Optional[list[int]] = None, + output_tasks: list[int] | None = None, ) -> tuple[list[int], int, int]: if train_X.ndim != 2: # Currently, batch mode MTGPs are blocked upstream in GPyTorch @@ -337,12 +335,12 @@ def get_all_tasks( @classmethod def construct_inputs( cls, - training_data: Union[SupervisedDataset, MultiTaskDataset], + training_data: SupervisedDataset | MultiTaskDataset, task_feature: int, - output_tasks: Optional[list[int]] = None, - task_covar_prior: Optional[Prior] = None, - prior_config: Optional[dict] = None, - rank: Optional[int] = None, + output_tasks: list[int] | None = None, + task_covar_prior: Prior | None = None, + prior_config: dict | None = None, + rank: int | None = None, ) -> dict[str, Any]: r"""Construct `Model` keyword arguments from a dataset and other args.
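A construction sketch for `MultiTaskGP` matching the signature above; the task-column layout and data are illustrative:

import torch
from botorch.models.multitask import MultiTaskGP

X = torch.rand(20, 2, dtype=torch.float64)
task = torch.randint(2, (20, 1)).double()  # task values 0.0 and 1.0
mtgp = MultiTaskGP(
    train_X=torch.cat([X, task], dim=-1),  # last column is the task feature
    train_Y=X.sum(dim=-1, keepdim=True).sin(),
    task_feature=-1,
)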
@@ -429,12 +427,12 @@ def __init__( self, train_X: Tensor, train_Y: Tensor, - likelihood: Optional[MultitaskGaussianLikelihood] = None, - data_covar_module: Optional[Module] = None, - task_covar_prior: Optional[Prior] = None, - rank: Optional[int] = None, - input_transform: Optional[InputTransform] = None, - outcome_transform: Optional[OutcomeTransform] = None, + likelihood: MultitaskGaussianLikelihood | None = None, + data_covar_module: Module | None = None, + task_covar_prior: Prior | None = None, + rank: int | None = None, + input_transform: InputTransform | None = None, + outcome_transform: OutcomeTransform | None = None, **kwargs: Any, ) -> None: r""" @@ -569,9 +567,9 @@ def predictive_mean_cache(self): def posterior( self, X: Tensor, - output_indices: Optional[list[int]] = None, - observation_noise: Union[bool, Tensor] = False, - posterior_transform: Optional[PosteriorTransform] = None, + output_indices: list[int] | None = None, + observation_noise: bool | Tensor = False, + posterior_transform: PosteriorTransform | None = None, ) -> MultitaskGPPosterior: self.eval() diff --git a/botorch/models/pairwise_gp.py b/botorch/models/pairwise_gp.py index e45f058755..6caccdd2d9 100644 --- a/botorch/models/pairwise_gp.py +++ b/botorch/models/pairwise_gp.py @@ -23,7 +23,7 @@ import warnings from collections.abc import Iterable from copy import deepcopy -from typing import Any, Optional, Union +from typing import Any import numpy as np import torch @@ -74,7 +74,7 @@ def _check_strict_input( def _scaled_psd_safe_cholesky( - matrix: Tensor, scale: Tensor, jitter: Optional[float] = None + matrix: Tensor, scale: Tensor, jitter: float | None = None ) -> Tensor: r"""scale matrix by 1/outputscale before cholesky for better numerical stability""" matrix = matrix / scale @@ -85,7 +85,7 @@ def _scaled_psd_safe_cholesky( def _ensure_psd_with_jitter( matrix: Tensor, - scale: Union[float, Tensor] = 1.0, + scale: float | Tensor = 1.0, jitter: float = 1e-8, max_tries: int = 3, ) -> Tensor: @@ -163,17 +163,17 @@ class PairwiseGP(Model, GP, FantasizeMixin): def __init__( self, - datapoints: Optional[Tensor], - comparisons: Optional[Tensor], - likelihood: Optional[PairwiseLikelihood] = None, - covar_module: Optional[ScaleKernel] = None, - input_transform: Optional[InputTransform] = None, + datapoints: Tensor | None, + comparisons: Tensor | None, + likelihood: PairwiseLikelihood | None = None, + covar_module: ScaleKernel | None = None, + input_transform: InputTransform | None = None, *, jitter: float = 1e-6, - xtol: Optional[float] = None, + xtol: float | None = None, consolidate_rtol: float = 0.0, consolidate_atol: float = 1e-4, - maxfev: Optional[int] = None, + maxfev: int | None = None, ) -> None: r""" Args: @@ -340,7 +340,7 @@ def _has_no_data(self): or self.comparisons is None ) - def _calc_covar(self, X1: Tensor, X2: Tensor) -> Union[Tensor, LinearOperator]: + def _calc_covar(self, X1: Tensor, X2: Tensor) -> Tensor | LinearOperator: r"""Calculate the covariance matrix given two sets of datapoints""" covar = self.covar_module(X1, X2).to_dense() # making sure covar is PSD when it's a covariance matrix @@ -370,7 +370,7 @@ def _update_covar(self, datapoints: Tensor) -> None: ) self.covar_inv = torch.cholesky_inverse(self.covar_chol) - def _prior_mean(self, X: Tensor) -> Union[Tensor, LinearOperator]: + def _prior_mean(self, X: Tensor) -> Tensor | LinearOperator: r"""Return point prediction using prior only Args: @@ -397,13 +397,13 @@ def _prior_predict(self, X: Tensor) -> tuple[Tensor, Tensor]: def 
_grad_posterior_f( self, - utility: Union[Tensor, np.ndarray], + utility: Tensor | np.ndarray, datapoints: Tensor, D: Tensor, covar_chol: Tensor, - covar_inv: Optional[Tensor] = None, + covar_inv: Tensor | None = None, ret_np: bool = False, - ) -> Union[Tensor, np.ndarray]: + ) -> Tensor | np.ndarray: r"""Compute the gradient of S loss wrt to f/utility in [Chu2005preference]_. For finding f_map, which is negative of the log posterior, i.e., -log(p(f|D)) @@ -441,13 +441,13 @@ def _grad_posterior_f( def _hess_posterior_f( self, - utility: Union[Tensor, np.ndarray], + utility: Tensor | np.ndarray, datapoints: Tensor, D: Tensor, covar_chol: Tensor, covar_inv: Tensor, ret_np: bool = False, - ) -> Union[Tensor, np.ndarray]: + ) -> Tensor | np.ndarray: r"""Compute the hessian of S loss wrt utility for finding f_map. which is negative of the log posterior, i.e., -log(p(f|D)) @@ -651,7 +651,7 @@ def _transform_batch_shape(self, X: Tensor, X_new: Tensor) -> tuple[Tensor, Tens return X.expand(X_new_bs + X.shape[-2:]), X_new def _util_newton_updates( - self, dp: Tensor, x0: Tensor, max_iter: int = 1, xtol: Optional[float] = None + self, dp: Tensor, x0: Tensor, max_iter: int = 1, xtol: float | None = None ) -> Tensor: r"""Make `max_iter` newton updates on utility. @@ -811,8 +811,8 @@ def construct_inputs( def set_train_data( self, - datapoints: Optional[Tensor] = None, - comparisons: Optional[Tensor] = None, + datapoints: Tensor | None = None, + comparisons: Tensor | None = None, strict: bool = False, update_model: bool = True, ) -> None: @@ -1068,9 +1068,9 @@ def forward(self, datapoints: Tensor) -> MultivariateNormal: def posterior( self, X: Tensor, - output_indices: Optional[list[int]] = None, + output_indices: list[int] | None = None, observation_noise: bool = False, - posterior_transform: Optional[PosteriorTransform] = None, + posterior_transform: PosteriorTransform | None = None, ) -> Posterior: r"""Computes the posterior over model outputs at the provided points. 
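A fitting sketch for `PairwiseGP`; the comparison data are made up (each row `(i, j)` records that item `i` was preferred over item `j`):

import torch
from botorch.fit import fit_gpytorch_mll
from botorch.models.pairwise_gp import PairwiseGP, PairwiseLaplaceMarginalLogLikelihood

datapoints = torch.rand(10, 2, dtype=torch.float64)
comparisons = torch.tensor([[0, 1], [2, 3], [4, 5]], dtype=torch.long)
pgp = PairwiseGP(datapoints, comparisons)
mll = PairwiseLaplaceMarginalLogLikelihood(pgp.likelihood, pgp)
fit_gpytorch_mll(mll)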
diff --git a/botorch/models/transforms/factory.py b/botorch/models/transforms/factory.py index faaca1d019..2dd5b164a7 100644 --- a/botorch/models/transforms/factory.py +++ b/botorch/models/transforms/factory.py @@ -7,7 +7,6 @@ from __future__ import annotations from collections import OrderedDict -from typing import Optional from botorch.models.transforms.input import ( ChainedInputTransform, @@ -20,8 +19,8 @@ def get_rounding_input_transform( one_hot_bounds: Tensor, - integer_indices: Optional[list[int]] = None, - categorical_features: Optional[dict[int, int]] = None, + integer_indices: list[int] | None = None, + categorical_features: dict[int, int] | None = None, initialization: bool = False, return_numeric: bool = False, approximate: bool = False, diff --git a/botorch/models/transforms/input.py b/botorch/models/transforms/input.py index 74bbafc191..ac6d6637fe 100644 --- a/botorch/models/transforms/input.py +++ b/botorch/models/transforms/input.py @@ -17,7 +17,8 @@ from abc import ABC, abstractmethod from collections import OrderedDict -from typing import Any, Callable, Iterable, List, Optional, Tuple, Union +from collections.abc import Callable, Iterable +from typing import Any from warnings import warn import numpy as np @@ -160,7 +161,7 @@ class BatchBroadcastedInputTransform(InputTransform, ModuleDict): def __init__( self, - transforms: List[InputTransform], + transforms: list[InputTransform], broadcast_index: int = -3, ) -> None: r"""A transform list that is broadcasted across a batch dimension specified by @@ -266,7 +267,7 @@ def preprocess_transform(self, X: Tensor) -> Tensor: dim=self.broadcast_index, ) - def _Xs_and_transforms(self, X: Tensor) -> Iterable[Tuple[Tensor, InputTransform]]: + def _Xs_and_transforms(self, X: Tensor) -> Iterable[tuple[Tensor, InputTransform]]: r"""Returns an iterable of sub-tensors of X and their associated transforms. Args: @@ -446,7 +447,7 @@ def __init__( d: int, coefficient: Tensor, offset: Tensor, - indices: Optional[Union[list[int], Tensor]] = None, + indices: list[int] | Tensor | None = None, batch_shape: torch.Size = torch.Size(), # noqa: B008 transform_on_train: bool = True, transform_on_eval: bool = True, @@ -627,15 +628,15 @@ class Normalize(AffineInputTransform): def __init__( self, d: int, - indices: Optional[Union[list[int], Tensor]] = None, - bounds: Optional[Tensor] = None, + indices: list[int] | Tensor | None = None, + bounds: Tensor | None = None, batch_shape: torch.Size = torch.Size(), # noqa: B008 transform_on_train: bool = True, transform_on_eval: bool = True, transform_on_fantasize: bool = True, reverse: bool = False, min_range: float = 1e-8, - learn_bounds: Optional[bool] = None, + learn_bounds: bool | None = None, almost_zero: float = 1e-12, ) -> None: r"""Normalize the inputs to the unit cube. 
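A usage sketch for `Normalize` with fixed bounds (omit `bounds` to infer the ranges from the training data instead); the numbers are illustrative:

import torch
from botorch.models.transforms.input import Normalize

intf = Normalize(d=2, bounds=torch.tensor([[0.0, 0.0], [10.0, 5.0]], dtype=torch.float64))
X_unit = intf(torch.tensor([[5.0, 2.5]], dtype=torch.float64))  # -> tensor([[0.5000, 0.5000]])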
@@ -776,7 +777,7 @@ class InputStandardize(AffineInputTransform): def __init__( self, d: int, - indices: Optional[Union[list[int], Tensor]] = None, + indices: list[int] | Tensor | None = None, batch_shape: torch.Size = torch.Size(), # noqa: B008 transform_on_train: bool = True, transform_on_eval: bool = True, @@ -898,8 +899,8 @@ class Round(InputTransform, Module): def __init__( self, - integer_indices: Union[list[int], LongTensor, None] = None, - categorical_features: Optional[dict[int, int]] = None, + integer_indices: list[int] | LongTensor | None = None, + categorical_features: dict[int, int] | None = None, transform_on_train: bool = True, transform_on_eval: bool = True, transform_on_fantasize: bool = True, @@ -1073,9 +1074,9 @@ def __init__( transform_on_fantasize: bool = True, reverse: bool = False, eps: float = 1e-7, - concentration1_prior: Optional[Prior] = None, - concentration0_prior: Optional[Prior] = None, - batch_shape: Optional[torch.Size] = None, + concentration1_prior: Prior | None = None, + concentration0_prior: Prior | None = None, + batch_shape: torch.Size | None = None, ) -> None: r"""Initialize transform. @@ -1146,7 +1147,7 @@ def __init__( ) self.register_constraint(param_name=p_name, constraint=constraint) - def _set_concentration(self, i: int, value: Union[float, Tensor]) -> None: + def _set_concentration(self, i: int, value: float | Tensor) -> None: if not torch.is_tensor(value): value = torch.as_tensor(value).to(self.concentration0) self.initialize(**{f"concentration{i}": value}) @@ -1258,10 +1259,10 @@ class AppendFeatures(InputTransform, Module): def __init__( self, - feature_set: Optional[Tensor] = None, - f: Optional[Callable[[Tensor], Tensor]] = None, - indices: Optional[list[int]] = None, - fkwargs: Optional[dict[str, Any]] = None, + feature_set: Tensor | None = None, + f: Callable[[Tensor], Tensor] | None = None, + indices: list[int] | None = None, + fkwargs: dict[str, Any] | None = None, skip_expand: bool = False, transform_on_train: bool = False, transform_on_eval: bool = True, @@ -1456,9 +1457,9 @@ class InputPerturbation(InputTransform, Module): def __init__( self, - perturbation_set: Union[Tensor, Callable[[Tensor], Tensor]], - bounds: Optional[Tensor] = None, - indices: Optional[list[int]] = None, + perturbation_set: Tensor | Callable[[Tensor], Tensor], + bounds: Tensor | None = None, + indices: list[int] | None = None, multiplicative: bool = False, transform_on_train: bool = False, transform_on_eval: bool = True, @@ -1575,7 +1576,7 @@ class OneHotToNumeric(InputTransform, Module): def __init__( self, dim: int, - categorical_features: Optional[dict[int, int]] = None, + categorical_features: dict[int, int] | None = None, transform_on_train: bool = True, transform_on_eval: bool = True, transform_on_fantasize: bool = True, diff --git a/botorch/models/transforms/outcome.py b/botorch/models/transforms/outcome.py index 4eda2be865..6f93c668a4 100644 --- a/botorch/models/transforms/outcome.py +++ b/botorch/models/transforms/outcome.py @@ -24,7 +24,6 @@ from abc import ABC, abstractmethod from collections import OrderedDict -from typing import Optional, Union import torch from botorch.models.transforms.utils import ( @@ -43,8 +42,8 @@ class OutcomeTransform(Module, ABC): @abstractmethod def forward( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: r"""Transform the outcomes in a model's training targets Args: @@ -78,8 +77,8 @@ def 
subset_output(self, idcs: list[int]) -> OutcomeTransform: ) def untransform( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: r"""Un-transform previously transformed outcomes Args: @@ -140,8 +139,8 @@ def __init__(self, **transforms: OutcomeTransform) -> None: super().__init__(OrderedDict(transforms)) def forward( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: r"""Transform the outcomes in a model's training targets Args: @@ -173,8 +172,8 @@ def subset_output(self, idcs: list[int]) -> OutcomeTransform: ) def untransform( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: r"""Un-transform previously transformed outcomes Args: @@ -198,7 +197,7 @@ def _is_linear(self) -> bool: A `ChainedOutcomeTransform` is linear only if all of the component transforms are linear. """ - return all((octf._is_linear for octf in self.values())) + return all(octf._is_linear for octf in self.values()) def untransform_posterior(self, posterior: Posterior) -> Posterior: r"""Un-transform a posterior @@ -226,7 +225,7 @@ class Standardize(OutcomeTransform): def __init__( self, m: int, - outputs: Optional[list[int]] = None, + outputs: list[int] | None = None, batch_shape: torch.Size = torch.Size(), # noqa: B008 min_stdv: float = 1e-8, ) -> None: @@ -251,8 +250,8 @@ def __init__( self._min_stdv = min_stdv def forward( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: r"""Standardize outcomes. If the module is in train mode, this updates the module state (i.e. the @@ -340,8 +339,8 @@ def subset_output(self, idcs: list[int]) -> OutcomeTransform: return new_tf def untransform( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: r"""Un-standardize outcomes. Args: @@ -372,7 +371,7 @@ def _is_linear(self) -> bool: def untransform_posterior( self, posterior: Posterior - ) -> Union[GPyTorchPosterior, TransformedPosterior]: + ) -> GPyTorchPosterior | TransformedPosterior: r"""Un-standardize the posterior. Args: @@ -455,7 +454,7 @@ class Log(OutcomeTransform): log-transformed outcomes and un-transform the model posterior of that GP. """ - def __init__(self, outputs: Optional[list[int]] = None) -> None: + def __init__(self, outputs: list[int] | None = None) -> None: r"""Log-transform outcomes. Args: @@ -488,8 +487,8 @@ def subset_output(self, idcs: list[int]) -> OutcomeTransform: return new_tf def forward( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: r"""Log-transform outcomes. Args: @@ -521,8 +520,8 @@ def forward( return Y_tf, Yvar def untransform( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: r"""Un-transform log-transformed outcomes Args: @@ -583,7 +582,7 @@ class Power(OutcomeTransform): power-transformed outcomes and un-transform the model posterior of that GP. 
""" - def __init__(self, power: float, outputs: Optional[list[int]] = None) -> None: + def __init__(self, power: float, outputs: list[int] | None = None) -> None: r"""Power-transform outcomes. Args: @@ -617,8 +616,8 @@ def subset_output(self, idcs: list[int]) -> OutcomeTransform: return new_tf def forward( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: r"""Power-transform outcomes. Args: @@ -650,8 +649,8 @@ def forward( return Y_tf, Yvar def untransform( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: r"""Un-transform power-transformed outcomes Args: @@ -709,7 +708,7 @@ class Bilog(OutcomeTransform): constraints as it magnifies values near zero and flattens extreme values. """ - def __init__(self, outputs: Optional[list[int]] = None) -> None: + def __init__(self, outputs: list[int] | None = None) -> None: r"""Bilog-transform outcomes. Args: @@ -742,8 +741,8 @@ def subset_output(self, idcs: list[int]) -> OutcomeTransform: return new_tf def forward( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: r"""Bilog-transform outcomes. Args: @@ -774,8 +773,8 @@ def forward( return Y_tf, Yvar def untransform( - self, Y: Tensor, Yvar: Optional[Tensor] = None - ) -> tuple[Tensor, Optional[Tensor]]: + self, Y: Tensor, Yvar: Tensor | None = None + ) -> tuple[Tensor, Tensor | None]: r"""Un-transform bilog-transformed outcomes Args: diff --git a/botorch/models/utils/assorted.py b/botorch/models/utils/assorted.py index b25a8f5e8c..fa146cb5dd 100644 --- a/botorch/models/utils/assorted.py +++ b/botorch/models/utils/assorted.py @@ -11,7 +11,6 @@ import warnings from collections.abc import Iterator from contextlib import contextmanager, ExitStack -from typing import Optional import torch from botorch import settings @@ -49,8 +48,8 @@ def multioutput_to_batch_mode_transform( train_X: Tensor, train_Y: Tensor, num_outputs: int, - train_Yvar: Optional[Tensor] = None, -) -> tuple[Tensor, Tensor, Optional[Tensor]]: + train_Yvar: Tensor | None = None, +) -> tuple[Tensor, Tensor, Tensor | None]: r"""Transforms training inputs for a multi-output model. Used for multi-output models that internally are represented by a @@ -138,7 +137,7 @@ def check_min_max_scaling( strict: bool = False, atol: float = 1e-2, raise_on_fail: bool = False, - ignore_dims: Optional[list[int]] = None, + ignore_dims: list[int] | None = None, ) -> None: r"""Check that tensor is normalized to the unit cube. @@ -226,9 +225,9 @@ def check_standardization( def validate_input_scaling( train_X: Tensor, train_Y: Tensor, - train_Yvar: Optional[Tensor] = None, + train_Yvar: Tensor | None = None, raise_on_fail: bool = False, - ignore_X_dims: Optional[list[int]] = None, + ignore_X_dims: list[int] | None = None, ) -> None: r"""Helper function to validate input data to models. diff --git a/botorch/models/utils/gpytorch_modules.py b/botorch/models/utils/gpytorch_modules.py index 9f096d6bf7..bfe5da8551 100644 --- a/botorch/models/utils/gpytorch_modules.py +++ b/botorch/models/utils/gpytorch_modules.py @@ -17,8 +17,8 @@ In International Conference on Machine Learning, 2024. 
""" +from collections.abc import Sequence from math import log, sqrt -from typing import Optional, Sequence, Union import torch from gpytorch.constraints.constraints import GreaterThan @@ -32,7 +32,7 @@ def get_matern_kernel_with_gamma_prior( - ard_num_dims: int, batch_shape: Optional[torch.Size] = None + ard_num_dims: int, batch_shape: torch.Size | None = None ) -> ScaleKernel: r"""Constructs the Scale-Matern kernel that is used by default by several models. This uses a Gamma(3.0, 6.0) prior for the lengthscale @@ -51,7 +51,7 @@ def get_matern_kernel_with_gamma_prior( def get_gaussian_likelihood_with_gamma_prior( - batch_shape: Optional[torch.Size] = None, + batch_shape: torch.Size | None = None, ) -> GaussianLikelihood: r"""Constructs the GaussianLikelihood that is used by default by several models. This uses a Gamma(1.1, 0.05) prior and constrains the @@ -72,7 +72,7 @@ def get_gaussian_likelihood_with_gamma_prior( def get_gaussian_likelihood_with_lognormal_prior( - batch_shape: Optional[torch.Size] = None, + batch_shape: torch.Size | None = None, ) -> GaussianLikelihood: """Return Gaussian likelihood with a LogNormal(-4.0, 1.0) prior. This prior is based on [Hvarfner2024vanilla]_. @@ -99,10 +99,10 @@ def get_gaussian_likelihood_with_lognormal_prior( def get_covar_module_with_dim_scaled_prior( ard_num_dims: int, - batch_shape: Optional[torch.Size] = None, + batch_shape: torch.Size | None = None, use_rbf_kernel: bool = True, - active_dims: Optional[Sequence[int]] = None, -) -> Union[MaternKernel, RBFKernel]: + active_dims: Sequence[int] | None = None, +) -> MaternKernel | RBFKernel: """Returns an RBF or Matern kernel with priors from [Hvarfner2024vanilla]_. diff --git a/botorch/models/utils/inducing_point_allocators.py b/botorch/models/utils/inducing_point_allocators.py index b1608d619d..27b49ebc25 100644 --- a/botorch/models/utils/inducing_point_allocators.py +++ b/botorch/models/utils/inducing_point_allocators.py @@ -21,7 +21,6 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Union import torch from botorch.exceptions.errors import UnsupportedError @@ -283,7 +282,7 @@ def _get_quality_function( def _pivoted_cholesky_init( train_inputs: Tensor, - kernel_matrix: Union[Tensor, LinearOperator], + kernel_matrix: Tensor | LinearOperator, max_length: int, quality_scores: Tensor, epsilon: float = 1e-6, diff --git a/botorch/optim/closures/core.py b/botorch/optim/closures/core.py index 01fa9085d0..7d64c04c45 100644 --- a/botorch/optim/closures/core.py +++ b/botorch/optim/closures/core.py @@ -8,10 +8,10 @@ from __future__ import annotations -from collections.abc import Sequence +from collections.abc import Callable, Sequence from functools import partial -from typing import Any, Callable, Optional +from typing import Any import torch from botorch.optim.utils import ( @@ -33,8 +33,8 @@ def __init__( forward: Callable[[], Tensor], parameters: dict[str, Tensor], backward: Callable[[Tensor], None] = Tensor.backward, - reducer: Optional[Callable[[Tensor], Tensor]] = torch.sum, - callback: Optional[Callable[[Tensor, Sequence[Optional[Tensor]]], None]] = None, + reducer: Callable[[Tensor], Tensor] | None = torch.sum, + callback: Callable[[Tensor, Sequence[Tensor | None]], None] | None = None, context_manager: Callable = None, # pyre-ignore [9] ) -> None: r"""Initializes a ForwardBackwardClosure instance. 
@@ -61,7 +61,7 @@ def __init__( self.callback = callback self.context_manager = context_manager - def __call__(self, **kwargs: Any) -> tuple[Tensor, tuple[Optional[Tensor], ...]]: + def __call__(self, **kwargs: Any) -> tuple[Tensor, tuple[Tensor | None, ...]]: with self.context_manager(): values = self.forward(**kwargs) value = values if self.reducer is None else self.reducer(values) @@ -80,7 +80,7 @@ class NdarrayOptimizationClosure: def __init__( self, - closure: Callable[[], tuple[Tensor, Sequence[Optional[Tensor]]]], + closure: Callable[[], tuple[Tensor, Sequence[Tensor | None]]], parameters: dict[str, Tensor], as_array: Callable[[Tensor], ndarray] = None, # pyre-ignore [9] as_tensor: Callable[[ndarray], Tensor] = torch.as_tensor, @@ -140,10 +140,10 @@ def __init__( self.fill_value = fill_value self.persistent = persistent - self._gradient_ndarray: Optional[ndarray] = None + self._gradient_ndarray: ndarray | None = None def __call__( - self, state: Optional[ndarray] = None, **kwargs: Any + self, state: ndarray | None = None, **kwargs: Any ) -> tuple[ndarray, ndarray]: if state is not None: self.state = state @@ -171,7 +171,7 @@ def state(self) -> ndarray: def state(self, state: ndarray) -> None: self._set_state(state) - def _get_gradient_ndarray(self, fill_value: Optional[float] = None) -> ndarray: + def _get_gradient_ndarray(self, fill_value: float | None = None) -> ndarray: if self.persistent and self._gradient_ndarray is not None: if fill_value is not None: self._gradient_ndarray.fill(fill_value) diff --git a/botorch/optim/closures/model_closures.py b/botorch/optim/closures/model_closures.py index c3fa36091b..44f93bf974 100644 --- a/botorch/optim/closures/model_closures.py +++ b/botorch/optim/closures/model_closures.py @@ -8,10 +8,10 @@ from __future__ import annotations -from collections.abc import Sequence +from collections.abc import Callable, Sequence from itertools import chain, repeat from types import NoneType -from typing import Any, Callable, Optional +from typing import Any from botorch.optim.closures.core import ForwardBackwardClosure from botorch.utils.dispatcher import Dispatcher, type_bypassing_encoder @@ -31,7 +31,7 @@ def get_loss_closure( mll: MarginalLogLikelihood, - data_loader: Optional[DataLoader] = None, + data_loader: DataLoader | None = None, **kwargs: Any, ) -> Callable[[], Tensor]: r"""Public API for GetLossClosure dispatcher. @@ -64,10 +64,10 @@ def get_loss_closure( def get_loss_closure_with_grads( mll: MarginalLogLikelihood, parameters: dict[str, Tensor], - data_loader: Optional[DataLoader] = None, + data_loader: DataLoader | None = None, backward: Callable[[Tensor], None] = Tensor.backward, - reducer: Optional[Callable[[Tensor], Tensor]] = Tensor.sum, - context_manager: Optional[Callable] = None, + reducer: Callable[[Tensor], Tensor] | None = Tensor.sum, + context_manager: Callable | None = None, **kwargs: Any, ) -> Callable[[], tuple[Tensor, tuple[Tensor, ...]]]: r"""Public API for GetLossClosureWithGrads dispatcher. 
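For context, a hedged sketch of the public API above, assuming `mll` is a GPyTorch `MarginalLogLikelihood` set up for an exact GP:

from botorch.optim.closures import get_loss_closure_with_grads
from botorch.optim.utils import get_parameters

parameters = get_parameters(mll, requires_grad=True)  # tensors to differentiate
closure = get_loss_closure_with_grads(mll, parameters=parameters)
loss, grads = closure()  # fitting loss and gradients aligned with `parameters`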
@@ -107,7 +107,7 @@ def _get_loss_closure_with_grads_fallback( mll: MarginalLogLikelihood, _likelihood_type: object, _model_type: object, - data_loader: Optional[DataLoader], + data_loader: DataLoader | None, parameters: dict[str, Tensor], reducer: Callable[[Tensor], Tensor] = Tensor.sum, backward: Callable[[Tensor], None] = Tensor.backward, diff --git a/botorch/optim/core.py b/botorch/optim/core.py index ea2ade6f16..8765a25e91 100644 --- a/botorch/optim/core.py +++ b/botorch/optim/core.py @@ -9,13 +9,13 @@ from __future__ import annotations import re -from collections.abc import Sequence +from collections.abc import Callable, Sequence from dataclasses import dataclass, replace from enum import auto, Enum from itertools import count from sys import maxsize from time import monotonic -from typing import Any, Callable, Optional, Union +from typing import Any from botorch.optim.closures import NdarrayOptimizationClosure from botorch.optim.utils.numpy_utils import get_bounds_as_ndarray @@ -46,24 +46,24 @@ class OptimizationStatus(int, Enum): @dataclass class OptimizationResult: step: int - fval: Union[float, int] + fval: float | int status: OptimizationStatus - runtime: Optional[float] = None - message: Optional[str] = None + runtime: float | None = None + message: str | None = None def scipy_minimize( - closure: Union[ - Callable[[], tuple[Tensor, Sequence[Optional[Tensor]]]], - NdarrayOptimizationClosure, - ], + closure: ( + Callable[[], tuple[Tensor, Sequence[Tensor | None]]] + | NdarrayOptimizationClosure + ), parameters: dict[str, Tensor], - bounds: Optional[dict[str, tuple[Optional[float], Optional[float]]]] = None, - callback: Optional[Callable[[dict[str, Tensor], OptimizationResult], None]] = None, - x0: Optional[ndarray] = None, + bounds: dict[str, tuple[float | None, float | None]] | None = None, + callback: Callable[[dict[str, Tensor], OptimizationResult], None] | None = None, + x0: ndarray | None = None, method: str = "L-BFGS-B", - options: Optional[dict[str, Any]] = None, - timeout_sec: Optional[float] = None, + options: dict[str, Any] | None = None, + timeout_sec: float | None = None, ) -> OptimizationResult: r"""Generic scipy.optimize.minimize-based optimization routine. @@ -141,15 +141,15 @@ def wrapped_callback(x: ndarray): def torch_minimize( - closure: Callable[[], tuple[Tensor, Sequence[Optional[Tensor]]]], + closure: Callable[[], tuple[Tensor, Sequence[Tensor | None]]], parameters: dict[str, Tensor], - bounds: Optional[dict[str, tuple[Optional[float], Optional[float]]]] = None, - callback: Optional[Callable[[dict[str, Tensor], OptimizationResult], None]] = None, - optimizer: Union[Optimizer, Callable[[list[Tensor]], Optimizer]] = Adam, - scheduler: Optional[Union[LRScheduler, Callable[[Optimizer], LRScheduler]]] = None, - step_limit: Optional[int] = None, - timeout_sec: Optional[float] = None, - stopping_criterion: Optional[Callable[[Tensor], bool]] = None, + bounds: dict[str, tuple[float | None, float | None]] | None = None, + callback: Callable[[dict[str, Tensor], OptimizationResult], None] | None = None, + optimizer: Optimizer | Callable[[list[Tensor]], Optimizer] = Adam, + scheduler: LRScheduler | Callable[[Optimizer], LRScheduler] | None = None, + step_limit: int | None = None, + timeout_sec: float | None = None, + stopping_criterion: Callable[[Tensor], bool] | None = None, ) -> OptimizationResult: r"""Generic torch.optim-based optimization routine. 
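To make the closure type concrete, a small self-contained example of what `scipy_minimize` and `torch_minimize` accept; the objective is made up:

import torch
from botorch.optim.core import torch_minimize

x = torch.zeros(2, requires_grad=True)

def closure() -> tuple[torch.Tensor, list[torch.Tensor | None]]:
    x.grad = None  # drop stale gradients
    loss = ((x - 1.0) ** 2).sum()
    loss.backward()
    return loss, [x.grad]

result = torch_minimize(closure, parameters={"x": x}, step_limit=100)
print(result.fval, result.status)  # fields of the OptimizationResult dataclass above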
diff --git a/botorch/optim/fit.py b/botorch/optim/fit.py index 828c5202ed..8920680b2f 100644 --- a/botorch/optim/fit.py +++ b/botorch/optim/fit.py @@ -8,10 +8,10 @@ from __future__ import annotations -from collections.abc import Sequence +from collections.abc import Callable, Sequence from functools import partial -from typing import Any, Callable, Optional, Union +from typing import Any, Optional from warnings import warn from botorch.exceptions.warnings import OptimizationWarning @@ -46,14 +46,14 @@ def fit_gpytorch_mll_scipy( mll: MarginalLogLikelihood, - parameters: Optional[dict[str, Tensor]] = None, - bounds: Optional[dict[str, tuple[Optional[float], Optional[float]]]] = None, - closure: Optional[Callable[[], tuple[Tensor, Sequence[Optional[Tensor]]]]] = None, - closure_kwargs: Optional[dict[str, Any]] = None, + parameters: dict[str, Tensor] | None = None, + bounds: dict[str, tuple[float | None, float | None]] | None = None, + closure: Callable[[], tuple[Tensor, Sequence[Tensor | None]]] | None = None, + closure_kwargs: dict[str, Any] | None = None, method: str = "L-BFGS-B", - options: Optional[dict[str, Any]] = None, - callback: Optional[Callable[[dict[str, Tensor], OptimizationResult], None]] = None, - timeout_sec: Optional[float] = None, + options: dict[str, Any] | None = None, + callback: Callable[[dict[str, Tensor], OptimizationResult], None] | None = None, + timeout_sec: float | None = None, ) -> OptimizationResult: r"""Generic scipy.optimize-based fitting routine for GPyTorch MLLs. @@ -112,16 +112,16 @@ def fit_gpytorch_mll_torch( mll: MarginalLogLikelihood, - parameters: Optional[dict[str, Tensor]] = None, - bounds: Optional[dict[str, tuple[Optional[float], Optional[float]]]] = None, - closure: Optional[Callable[[], tuple[Tensor, Sequence[Optional[Tensor]]]]] = None, - closure_kwargs: Optional[dict[str, Any]] = None, - step_limit: Optional[int] = None, - stopping_criterion: Optional[Callable[[Tensor], bool]] = DEFAULT, # pyre-ignore [9] - optimizer: Union[Optimizer, Callable[..., Optimizer]] = Adam, - scheduler: Optional[Union[_LRScheduler, Callable[..., _LRScheduler]]] = None, - callback: Optional[Callable[[dict[str, Tensor], OptimizationResult], None]] = None, - timeout_sec: Optional[float] = None, + parameters: dict[str, Tensor] | None = None, + bounds: dict[str, tuple[float | None, float | None]] | None = None, + closure: Callable[[], tuple[Tensor, Sequence[Tensor | None]]] | None = None, + closure_kwargs: dict[str, Any] | None = None, + step_limit: int | None = None, + stopping_criterion: Callable[[Tensor], bool] | None = DEFAULT, # pyre-ignore [9] + optimizer: Optimizer | Callable[..., Optimizer] = Adam, + scheduler: _LRScheduler | Callable[..., _LRScheduler] | None = None, + callback: Callable[[dict[str, Tensor], OptimizationResult], None] | None = None, + timeout_sec: float | None = None, ) -> OptimizationResult: r"""Generic torch.optim-based fitting routine for GPyTorch MLLs. diff --git a/botorch/optim/homotopy.py b/botorch/optim/homotopy.py index 4aaed8e076..be062e3bfa 100644 --- a/botorch/optim/homotopy.py +++ b/botorch/optim/homotopy.py @@ -6,8 +6,8 @@ from __future__ import annotations import math +from collections.abc import Callable from dataclasses import dataclass -from typing import Callable, Optional, Union import torch from torch import Tensor @@ -87,7 +87,7 @@ class HomotopyParameter: correspond to a buffer of a module. The parameter has a corresponding schedule.
""" - parameter: Union[Parameter, Tensor] + parameter: Parameter | Tensor schedule: FixedHomotopySchedule @@ -104,7 +104,7 @@ class Homotopy: def __init__( self, homotopy_parameters: list[HomotopyParameter], - callbacks: Optional[list[Callable]] = None, + callbacks: list[Callable] | None = None, ) -> None: r"""Initialize the homotopy. diff --git a/botorch/optim/initializers.py b/botorch/optim/initializers.py index 17f9edfa72..4afe1b1a2a 100644 --- a/botorch/optim/initializers.py +++ b/botorch/optim/initializers.py @@ -15,8 +15,9 @@ from __future__ import annotations import warnings +from collections.abc import Callable from math import ceil -from typing import Callable, Optional, Union +from typing import Optional, Union import torch from botorch import settings @@ -70,7 +71,7 @@ def transform_constraints( - constraints: Union[list[tuple[Tensor, Tensor, float]], None], q: int, d: int + constraints: list[tuple[Tensor, Tensor, float]] | None, q: int, d: int ) -> list[tuple[Tensor, Tensor, float]]: r"""Transform constraints to sample from a d*q-dimensional space instead of a d-dimensional state. @@ -182,8 +183,8 @@ def sample_q_batches_from_polytope( n_burnin: int, n_thinning: int, seed: int, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + equality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, ) -> Tensor: r"""Samples `n` q-baches from a polytope of dimension `d`. @@ -246,12 +247,12 @@ def gen_batch_initial_conditions( q: int, num_restarts: int, raw_samples: int, - fixed_features: Optional[dict[int, float]] = None, - options: Optional[dict[str, Union[bool, float, int]]] = None, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - generator: Optional[Callable[[int, int, Optional[int]], Tensor]] = None, - fixed_X_fantasies: Optional[Tensor] = None, + fixed_features: dict[int, float] | None = None, + options: dict[str, bool | float | int] | None = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + equality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + generator: Callable[[int, int, int | None], Tensor] | None = None, + fixed_X_fantasies: Tensor | None = None, ) -> Tensor: r"""Generate a batch of initial conditions for random-restart optimziation. @@ -318,8 +319,8 @@ def gen_batch_initial_conditions( "Option 'sample_around_best' is not supported when custom " "generator is be used." 
) - seed: Optional[int] = options.get("seed") - batch_limit: Optional[int] = options.get( + seed: int | None = options.get("seed") + batch_limit: int | None = options.get( "init_batch_limit", options.get("batch_limit") ) factor, max_factor = 1, 5 @@ -444,11 +445,11 @@ def gen_one_shot_kg_initial_conditions( q: int, num_restarts: int, raw_samples: int, - fixed_features: Optional[dict[int, float]] = None, - options: Optional[dict[str, Union[bool, float, int]]] = None, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, -) -> Optional[Tensor]: + fixed_features: dict[int, float] | None = None, + options: dict[str, bool | float | int] | None = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + equality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, +) -> Tensor | None: r"""Generate a batch of smart initializations for qKnowledgeGradient. This function generates initial conditions for optimizing one-shot KG using @@ -563,11 +564,11 @@ def gen_one_shot_hvkg_initial_conditions( q: int, num_restarts: int, raw_samples: int, - fixed_features: Optional[dict[int, float]] = None, - options: Optional[dict[str, Union[bool, float, int]]] = None, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, -) -> Optional[Tensor]: + fixed_features: dict[int, float] | None = None, + options: dict[str, bool | float | int] | None = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + equality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, +) -> Tensor | None: r"""Generate a batch of smart initializations for qHypervolumeKnowledgeGradient. This function generates initial conditions for optimizing one-shot HVKG using @@ -761,8 +762,8 @@ def gen_value_function_initial_conditions( num_restarts: int, raw_samples: int, current_model: Model, - fixed_features: Optional[dict[int, float]] = None, - options: Optional[dict[str, Union[bool, float, int]]] = None, + fixed_features: dict[int, float] | None = None, + options: dict[str, bool | float | int] | None = None, ) -> Tensor: r"""Generate a batch of smart initializations for optimizing the value function of qKnowledgeGradient. @@ -818,7 +819,7 @@ def gen_value_function_initial_conditions( >>> ) """ options = options or {} - seed: Optional[int] = options.get("seed") + seed: int | None = options.get("seed") frac_random: float = options.get("frac_random", 0.6) if not 0 < frac_random < 1: raise ValueError( @@ -1044,8 +1045,8 @@ def sample_points_around_best( bounds: Tensor, best_pct: float = 5.0, subset_sigma: float = 1e-1, - prob_perturb: Optional[float] = None, -) -> Optional[Tensor]: + prob_perturb: float | None = None, +) -> Tensor | None: r"""Find best points and sample nearby points. Args: @@ -1199,7 +1200,7 @@ def sample_perturbed_subset_dims( n_discrete_points: int, sigma: float = 1e-1, qmc: bool = True, - prob_perturb: Optional[float] = None, + prob_perturb: float | None = None, ) -> Tensor: r"""Sample around `X` by perturbing a subset of the dimensions. 
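By way of illustration, a hypothetical call against the modernized `gen_batch_initial_conditions` signature; `acqf` stands in for an already-built acquisition function on a 2-dimensional unit box:

import torch
from botorch.optim.initializers import gen_batch_initial_conditions

ics = gen_batch_initial_conditions(
    acq_function=acqf,  # assumed to exist
    bounds=torch.tensor([[0.0, 0.0], [1.0, 1.0]]),
    q=2,
    num_restarts=10,
    raw_samples=256,
    options={"seed": 0},  # read back internally as `seed: int | None`
)
# `ics` is a `num_restarts x q x d` tensor of starting points.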
diff --git a/botorch/optim/optimize.py b/botorch/optim/optimize.py index 76e6ab175a..b0014f7f4b 100644 --- a/botorch/optim/optimize.py +++ b/botorch/optim/optimize.py @@ -12,7 +12,8 @@ import dataclasses import warnings -from typing import Any, Callable, Optional, Union +from collections.abc import Callable +from typing import Any import torch from botorch.acquisition.acquisition import ( @@ -66,19 +67,19 @@ class OptimizeAcqfInputs: bounds: Tensor q: int num_restarts: int - raw_samples: Optional[int] - options: Optional[dict[str, Union[bool, float, int, str]]] - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] - nonlinear_inequality_constraints: Optional[list[tuple[Callable, bool]]] - fixed_features: Optional[dict[int, float]] - post_processing_func: Optional[Callable[[Tensor], Tensor]] - batch_initial_conditions: Optional[Tensor] + raw_samples: int | None + options: dict[str, bool | float | int | str] | None + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None + equality_constraints: list[tuple[Tensor, Tensor, float]] | None + nonlinear_inequality_constraints: list[tuple[Callable, bool]] | None + fixed_features: dict[int, float] | None + post_processing_func: Callable[[Tensor], Tensor] | None + batch_initial_conditions: Tensor | None return_best_only: bool gen_candidates: TGenCandidates sequential: bool - ic_generator: Optional[TGenInitialConditions] = None - timeout_sec: Optional[float] = None + ic_generator: TGenInitialConditions | None = None + timeout_sec: float | None = None return_full_tree: bool = False retry_on_optimization_warning: bool = True ic_gen_kwargs: dict = dataclasses.field(default_factory=dict) @@ -332,7 +333,7 @@ def _optimize_batch_candidates() -> tuple[Tensor, Tensor, list[Warning]]: batch_candidates, batch_acq_values, ws = _optimize_batch_candidates() optimization_warning_raised = any( - (issubclass(w.category, OptimizationWarning) for w in ws) + issubclass(w.category, OptimizationWarning) for w in ws ) if optimization_warning_raised and opt_inputs.retry_on_optimization_warning: first_warn_msg = ( @@ -366,7 +367,7 @@ def _optimize_batch_candidates() -> tuple[Tensor, Tensor, list[Warning]]: batch_candidates, batch_acq_values, ws = _optimize_batch_candidates() optimization_warning_raised = any( - (issubclass(w.category, OptimizationWarning) for w in ws) + issubclass(w.category, OptimizationWarning) for w in ws ) if optimization_warning_raised: warnings.warn( @@ -403,20 +404,20 @@ def optimize_acqf( bounds: Tensor, q: int, num_restarts: int, - raw_samples: Optional[int] = None, - options: Optional[dict[str, Union[bool, float, int, str]]] = None, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - nonlinear_inequality_constraints: Optional[list[tuple[Callable, bool]]] = None, - fixed_features: Optional[dict[int, float]] = None, - post_processing_func: Optional[Callable[[Tensor], Tensor]] = None, - batch_initial_conditions: Optional[Tensor] = None, + raw_samples: int | None = None, + options: dict[str, bool | float | int | str] | None = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + equality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + nonlinear_inequality_constraints: list[tuple[Callable, bool]] | None = None, + fixed_features: dict[int, float] | None = None, + post_processing_func: Callable[[Tensor], 
Tensor] | None = None, + batch_initial_conditions: Tensor | None = None, return_best_only: bool = True, - gen_candidates: Optional[TGenCandidates] = None, + gen_candidates: TGenCandidates | None = None, sequential: bool = False, *, - ic_generator: Optional[TGenInitialConditions] = None, - timeout_sec: Optional[float] = None, + ic_generator: TGenInitialConditions | None = None, + timeout_sec: float | None = None, return_full_tree: bool = False, retry_on_optimization_warning: bool = True, **ic_gen_kwargs: Any, @@ -573,17 +574,17 @@ def optimize_acqf_cyclic( bounds: Tensor, q: int, num_restarts: int, - raw_samples: Optional[int] = None, - options: Optional[dict[str, Union[bool, float, int, str]]] = None, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - fixed_features: Optional[dict[int, float]] = None, - post_processing_func: Optional[Callable[[Tensor], Tensor]] = None, - batch_initial_conditions: Optional[Tensor] = None, - cyclic_options: Optional[dict[str, Union[bool, float, int, str]]] = None, + raw_samples: int | None = None, + options: dict[str, bool | float | int | str] | None = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + equality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + fixed_features: dict[int, float] | None = None, + post_processing_func: Callable[[Tensor], Tensor] | None = None, + batch_initial_conditions: Tensor | None = None, + cyclic_options: dict[str, bool | float | int | str] | None = None, *, - ic_generator: Optional[TGenInitialConditions] = None, - timeout_sec: Optional[float] = None, + ic_generator: TGenInitialConditions | None = None, + timeout_sec: float | None = None, return_full_tree: bool = False, retry_on_optimization_warning: bool = True, **ic_gen_kwargs: Any, @@ -708,16 +709,16 @@ def optimize_acqf_list( acq_function_list: list[AcquisitionFunction], bounds: Tensor, num_restarts: int, - raw_samples: Optional[int] = None, - options: Optional[dict[str, Union[bool, float, int, str]]] = None, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - nonlinear_inequality_constraints: Optional[list[tuple[Callable, bool]]] = None, - fixed_features: Optional[dict[int, float]] = None, - fixed_features_list: Optional[list[dict[int, float]]] = None, - post_processing_func: Optional[Callable[[Tensor], Tensor]] = None, - ic_generator: Optional[TGenInitialConditions] = None, - ic_gen_kwargs: Optional[dict] = None, + raw_samples: int | None = None, + options: dict[str, bool | float | int | str] | None = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + equality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + nonlinear_inequality_constraints: list[tuple[Callable, bool]] | None = None, + fixed_features: dict[int, float] | None = None, + fixed_features_list: list[dict[int, float]] | None = None, + post_processing_func: Callable[[Tensor], Tensor] | None = None, + ic_generator: TGenInitialConditions | None = None, + ic_gen_kwargs: dict | None = None, ) -> tuple[Tensor, Tensor]: r"""Generate a list of candidates from a list of acquisition functions. 
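A short hypothetical call against the updated `optimize_acqf` surface (model and acquisition construction omitted; `acqf` is assumed):

import torch
from botorch.optim import optimize_acqf

candidates, acq_value = optimize_acqf(
    acq_function=acqf,
    bounds=torch.tensor([[0.0, 0.0], [1.0, 1.0]]),
    q=1,
    num_restarts=10,
    raw_samples=512,  # now typed `int | None` rather than `Optional[int]`
)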
@@ -842,15 +843,15 @@ def optimize_acqf_mixed( q: int, num_restarts: int, fixed_features_list: list[dict[int, float]], - raw_samples: Optional[int] = None, - options: Optional[dict[str, Union[bool, float, int, str]]] = None, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - nonlinear_inequality_constraints: Optional[list[tuple[Callable, bool]]] = None, - post_processing_func: Optional[Callable[[Tensor], Tensor]] = None, - batch_initial_conditions: Optional[Tensor] = None, - ic_generator: Optional[TGenInitialConditions] = None, - ic_gen_kwargs: Optional[dict] = None, + raw_samples: int | None = None, + options: dict[str, bool | float | int | str] | None = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + equality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + nonlinear_inequality_constraints: list[tuple[Callable, bool]] | None = None, + post_processing_func: Callable[[Tensor], Tensor] | None = None, + batch_initial_conditions: Tensor | None = None, + ic_generator: TGenInitialConditions | None = None, + ic_gen_kwargs: dict | None = None, ) -> tuple[Tensor, Tensor]: r"""Optimize over a list of fixed_features and return the best solution. @@ -1140,9 +1141,9 @@ def optimize_acqf_discrete_local_search( q: int, num_restarts: int = 20, raw_samples: int = 4096, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - X_avoid: Optional[Tensor] = None, - batch_initial_conditions: Optional[Tensor] = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + X_avoid: Tensor | None = None, + batch_initial_conditions: Tensor | None = None, max_batch_size: int = 2048, unique: bool = True, ) -> tuple[Tensor, Tensor]: diff --git a/botorch/optim/optimize_homotopy.py b/botorch/optim/optimize_homotopy.py index f0077f8522..cfad4a0b6e 100644 --- a/botorch/optim/optimize_homotopy.py +++ b/botorch/optim/optimize_homotopy.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Callable, Optional, Union +from collections.abc import Callable import torch from botorch.acquisition import AcquisitionFunction @@ -52,12 +52,12 @@ def optimize_acqf_homotopy( q: int, homotopy: Homotopy, num_restarts: int, - raw_samples: Optional[int] = None, - fixed_features: Optional[dict[int, float]] = None, - options: Optional[dict[str, Union[bool, float, int, str]]] = None, - final_options: Optional[dict[str, Union[bool, float, int, str]]] = None, - batch_initial_conditions: Optional[Tensor] = None, - post_processing_func: Optional[Callable[[Tensor], Tensor]] = None, + raw_samples: int | None = None, + fixed_features: dict[int, float] | None = None, + options: dict[str, bool | float | int | str] | None = None, + final_options: dict[str, bool | float | int | str] | None = None, + batch_initial_conditions: Tensor | None = None, + post_processing_func: Callable[[Tensor], Tensor] | None = None, prune_tolerance: float = 1e-4, ) -> tuple[Tensor, Tensor]: r"""Generate a set of candidates via multi-start optimization.
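For `optimize_acqf_mixed` above, a sketch assuming `acqf` and `bounds` exist and feature 0 is binary; each entry of `fixed_features_list` pins that feature to one value while the continuous remainder is optimized:

from botorch.optim import optimize_acqf_mixed

candidates, value = optimize_acqf_mixed(
    acq_function=acqf,
    bounds=bounds,
    q=1,
    num_restarts=10,
    raw_samples=256,
    fixed_features_list=[{0: 0.0}, {0: 1.0}],
)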
diff --git a/botorch/optim/parameter_constraints.py b/botorch/optim/parameter_constraints.py index b5536db48b..0f85235561 100644 --- a/botorch/optim/parameter_constraints.py +++ b/botorch/optim/parameter_constraints.py @@ -10,8 +10,10 @@ from __future__ import annotations +from collections.abc import Callable + from functools import partial -from typing import Callable, Optional, Union +from typing import Union import numpy as np import torch @@ -28,9 +30,9 @@ def make_scipy_bounds( X: Tensor, - lower_bounds: Optional[Union[float, Tensor]] = None, - upper_bounds: Optional[Union[float, Tensor]] = None, -) -> Optional[Bounds]: + lower_bounds: float | Tensor | None = None, + upper_bounds: float | Tensor | None = None, +) -> Bounds | None: r"""Creates a scipy Bounds object for optimization. Args: @@ -51,7 +53,7 @@ def make_scipy_bounds( if lower_bounds is None and upper_bounds is None: return None - def _expand(bounds: Union[float, Tensor], X: Tensor, lower: bool) -> Tensor: + def _expand(bounds: float | Tensor, X: Tensor, lower: bool) -> Tensor: if bounds is None: ebounds = torch.full_like(X, float("-inf" if lower else "inf")) else: @@ -67,8 +69,8 @@ def _expand(bounds: Union[float, Tensor], X: Tensor, lower: bool) -> Tensor: def make_scipy_linear_constraints( shapeX: torch.Size, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + equality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, ) -> list[ScipyConstraintDict]: r"""Generate scipy constraints from torch representation. @@ -370,10 +372,10 @@ def get_interpoint_constraint(b: int, nlc: Callable) -> Callable: def _generate_unfixed_nonlin_constraints( - constraints: Optional[list[tuple[Callable[[Tensor], Tensor], bool]]], + constraints: list[tuple[Callable[[Tensor], Tensor], bool]] | None, fixed_features: dict[int, float], dimension: int, -) -> Optional[list[Callable[[Tensor], Tensor]]]: +) -> list[Callable[[Tensor], Tensor]] | None: """Given a dictionary of fixed features, returns a list of callables for nonlinear inequality constraints expecting only a tensor with the non-fixed features as input. @@ -410,11 +412,11 @@ def new_nonlin_constraint(X: Tensor) -> Tensor: def _generate_unfixed_lin_constraints( - constraints: Optional[list[tuple[Tensor, Tensor, float]]], + constraints: list[tuple[Tensor, Tensor, float]] | None, fixed_features: dict[int, float], dimension: int, eq: bool, -) -> Optional[list[tuple[Tensor, Tensor, float]]]: +) -> list[tuple[Tensor, Tensor, float]] | None: # If constraints is None or an empty list, then return itself if not constraints: return constraints diff --git a/botorch/optim/utils/acquisition_utils.py b/botorch/optim/utils/acquisition_utils.py index 48292c4291..0feb68679e 100644 --- a/botorch/optim/utils/acquisition_utils.py +++ b/botorch/optim/utils/acquisition_utils.py @@ -8,7 +8,6 @@ from __future__ import annotations -from typing import Optional, Union from warnings import warn import torch @@ -21,8 +20,8 @@ def columnwise_clamp( X: Tensor, - lower: Optional[Union[float, Tensor]] = None, - upper: Optional[Union[float, Tensor]] = None, + lower: float | Tensor | None = None, + upper: float | Tensor | None = None, raise_on_violation: bool = False, ) -> Tensor: r"""Clamp values of a Tensor in column-wise fashion (with support for t-batches).
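A small sketch of `columnwise_clamp` with the `float | Tensor | None` bounds it now advertises:

import torch
from botorch.optim.utils.acquisition_utils import columnwise_clamp

X = torch.randn(4, 3)
X_box = columnwise_clamp(X, lower=0.0, upper=1.0)  # scalar bounds broadcast over columns
lower = torch.tensor([0.0, -1.0, 0.5])  # per-column bounds are also accepted
X_col = columnwise_clamp(X, lower=lower, upper=None)  # None leaves that side unclamped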
@@ -64,7 +63,7 @@ def columnwise_clamp( def fix_features( - X: Tensor, fixed_features: Optional[dict[int, Optional[float]]] = None + X: Tensor, fixed_features: dict[int, float | None] | None = None ) -> Tensor: r"""Fix feature values in a Tensor. @@ -93,7 +92,7 @@ def fix_features( return torch.stack(columns, dim=-1) -def get_X_baseline(acq_function: AcquisitionFunction) -> Optional[Tensor]: +def get_X_baseline(acq_function: AcquisitionFunction) -> Tensor | None: r"""Extract X_baseline from an acquisition function. This tries to find the baseline set of points. First, this checks if the diff --git a/botorch/optim/utils/common.py b/botorch/optim/utils/common.py index 44990ece06..93fde01048 100644 --- a/botorch/optim/utils/common.py +++ b/botorch/optim/utils/common.py @@ -8,8 +8,9 @@ from __future__ import annotations +from collections.abc import Callable + from logging import debug as logging_debug -from typing import Callable, Optional from warnings import warn_explicit, WarningMessage import numpy as np @@ -17,7 +18,7 @@ def _handle_numerical_errors( - error: RuntimeError, x: np.ndarray, dtype: Optional[np.dtype] = None + error: RuntimeError, x: np.ndarray, dtype: np.dtype | None = None ) -> tuple[np.ndarray, np.ndarray]: if isinstance(error, NotPSDError): raise error @@ -34,8 +35,8 @@ def _handle_numerical_errors( def _warning_handler_template( w: WarningMessage, - debug: Optional[Callable[[WarningMessage], bool]] = None, - rethrow: Optional[Callable[[WarningMessage], bool]] = None, + debug: Callable[[WarningMessage], bool] | None = None, + rethrow: Callable[[WarningMessage], bool] | None = None, ) -> bool: r"""Helper for making basic warning handlers. Typically used with functools.partial. diff --git a/botorch/optim/utils/model_utils.py b/botorch/optim/utils/model_utils.py index 092410c3fb..f2ad418468 100644 --- a/botorch/optim/utils/model_utils.py +++ b/botorch/optim/utils/model_utils.py @@ -8,10 +8,10 @@ from __future__ import annotations -from collections.abc import Iterator +from collections.abc import Callable, Iterator from re import Pattern -from typing import Any, Callable, NamedTuple, Optional, Union +from typing import Any, NamedTuple from warnings import warn import torch @@ -39,8 +39,8 @@ def get_data_loader( def get_parameters( module: Module, - requires_grad: Optional[bool] = None, - name_filter: Optional[Callable[[str], bool]] = None, + requires_grad: bool | None = None, + name_filter: Callable[[str], bool] | None = None, ) -> dict[str, Tensor]: r"""Helper method for obtaining a module's parameters and their respective ranges. @@ -68,10 +68,10 @@ def get_parameters( def get_parameters_and_bounds( module: Module, - requires_grad: Optional[bool] = None, - name_filter: Optional[Callable[[str], bool]] = None, + requires_grad: bool | None = None, + name_filter: Callable[[str], bool] | None = None, default_bounds: tuple[float, float] = (-float("inf"), float("inf")), -) -> tuple[dict[str, Tensor], dict[str, tuple[Optional[float], Optional[float]]]]: +) -> tuple[dict[str, Tensor], dict[str, tuple[float | None, float | None]]]: r"""Helper method for obtaining a module's parameters and their respective ranges. 
Args: @@ -110,8 +110,8 @@ def get_parameters_and_bounds( def get_name_filter( - patterns: Iterator[Union[Pattern, str]] -) -> Callable[[Union[str, tuple[str, Any, ...]]], bool]: + patterns: Iterator[Pattern | str], +) -> Callable[[str | tuple[str, Any, ...]], bool]: r"""Returns a binary function that filters strings (or iterables whose first element is a string) according to a bank of excluded patterns. Typically, used in conjunction with generators such as `module.named_parameters()`. @@ -136,7 +136,7 @@ def get_name_filter( f"but found {type(pattern)}." ) - def name_filter(item: Union[str, tuple[str, Any, ...]]) -> bool: + def name_filter(item: str | tuple[str, Any, ...]) -> bool: name = item if isinstance(item, str) else next(iter(item)) if name in names: return False diff --git a/botorch/optim/utils/numpy_utils.py b/botorch/optim/utils/numpy_utils.py index fc815ea17f..2ffe3dabe4 100644 --- a/botorch/optim/utils/numpy_utils.py +++ b/botorch/optim/utils/numpy_utils.py @@ -8,10 +8,9 @@ from __future__ import annotations -from collections.abc import Iterator +from collections.abc import Callable, Iterator from itertools import tee -from typing import Callable, Optional, Union import numpy as np import torch @@ -35,7 +34,7 @@ def as_ndarray( - values: Tensor, dtype: Optional[np.dtype] = None, inplace: bool = True + values: Tensor, dtype: np.dtype | None = None, inplace: bool = True ) -> ndarray: r"""Helper for going from torch.Tensor to numpy.ndarray. @@ -67,9 +66,9 @@ def as_ndarray( def get_tensors_as_ndarray_1d( - tensors: Union[Iterator[Tensor], dict[str, Tensor]], - out: Optional[ndarray] = None, - dtype: Optional[Union[np.dtype, str]] = None, + tensors: Iterator[Tensor] | dict[str, Tensor], + out: ndarray | None = None, + dtype: np.dtype | str | None = None, as_array: Callable[[Tensor], ndarray] = as_ndarray, ) -> ndarray: # Create a pair of iterators, one for setup and one for data transfer @@ -112,7 +111,7 @@ def get_tensors_as_ndarray_1d( def set_tensors_from_ndarray_1d( - tensors: Union[Iterator[Tensor], dict[str, Tensor]], + tensors: Iterator[Tensor] | dict[str, Tensor], array: ndarray, as_tensor: Callable[[ndarray], Tensor] = torch.as_tensor, ) -> None: @@ -137,10 +136,8 @@ def set_tensors_from_ndarray_1d( def get_bounds_as_ndarray( parameters: dict[str, Tensor], - bounds: dict[ - str, tuple[Optional[Union[float, Tensor]], Optional[Union[float, Tensor]]] - ], -) -> Optional[np.ndarray]: + bounds: dict[str, tuple[float | Tensor | None, float | Tensor | None]], +) -> np.ndarray | None: r"""Helper method for converting bounds into an ndarray. Args: diff --git a/botorch/optim/utils/timeout.py b/botorch/optim/utils/timeout.py index eea0d98963..38f3d3d3e1 100644 --- a/botorch/optim/utils/timeout.py +++ b/botorch/optim/utils/timeout.py @@ -8,8 +8,8 @@ import time import warnings -from collections.abc import Sequence -from typing import Any, Callable, Optional, Union +from collections.abc import Callable, Sequence +from typing import Any import numpy as np from botorch.exceptions.errors import OptimizationTimeoutError @@ -20,16 +20,16 @@ def minimize_with_timeout( fun: Callable[[np.ndarray, ...], float], x0: np.ndarray, args: tuple[Any, ...] 
= (), - method: Optional[str] = None, - jac: Optional[Union[str, Callable, bool]] = None, - hess: Optional[Union[str, Callable, optimize.HessianUpdateStrategy]] = None, - hessp: Optional[Callable] = None, - bounds: Optional[Union[Sequence[tuple[float, float]], optimize.Bounds]] = None, + method: str | None = None, + jac: str | Callable | bool | None = None, + hess: str | Callable | optimize.HessianUpdateStrategy | None = None, + hessp: Callable | None = None, + bounds: Sequence[tuple[float, float]] | optimize.Bounds | None = None, constraints=(), # Typing this properly is a s**t job - tol: Optional[float] = None, - callback: Optional[Callable] = None, - options: Optional[dict[str, Any]] = None, - timeout_sec: Optional[float] = None, + tol: float | None = None, + callback: Callable | None = None, + options: dict[str, Any] | None = None, + timeout_sec: float | None = None, ) -> optimize.OptimizeResult: r"""Wrapper around scipy.optimize.minimize to support timeout. diff --git a/botorch/posteriors/ensemble.py b/botorch/posteriors/ensemble.py index 7eeebb4cf2..a3fb8b5ffe 100644 --- a/botorch/posteriors/ensemble.py +++ b/botorch/posteriors/ensemble.py @@ -10,7 +10,6 @@ from __future__ import annotations -from typing import Optional import torch from botorch.posteriors.posterior import Posterior @@ -79,7 +78,7 @@ def _extended_shape( def rsample( self, - sample_shape: Optional[torch.Size] = None, + sample_shape: torch.Size | None = None, ) -> Tensor: r"""Sample from the posterior (with gradients). diff --git a/botorch/posteriors/fully_bayesian.py b/botorch/posteriors/fully_bayesian.py index 9d0fdaadc3..c47882adb0 100644 --- a/botorch/posteriors/fully_bayesian.py +++ b/botorch/posteriors/fully_bayesian.py @@ -5,7 +5,8 @@ from __future__ import annotations -from typing import Callable, Optional +from collections.abc import Callable + from warnings import warn import torch @@ -105,9 +106,9 @@ def __init__(self, distribution: MultivariateNormal) -> None: ) self._covariance_matrix = distribution.lazy_covariance_matrix - self._mixture_mean: Optional[Tensor] = None - self._mixture_variance: Optional[Tensor] = None - self._mixture_covariance_matrix: Optional[Tensor] = None + self._mixture_mean: Tensor | None = None + self._mixture_variance: Tensor | None = None + self._mixture_covariance_matrix: Tensor | None = None @property def mixture_mean(self) -> Tensor: diff --git a/botorch/posteriors/gpytorch.py b/botorch/posteriors/gpytorch.py index 4e29f69287..8db42545ad 100644 --- a/botorch/posteriors/gpytorch.py +++ b/botorch/posteriors/gpytorch.py @@ -11,7 +11,7 @@ from __future__ import annotations from contextlib import ExitStack -from typing import Optional, TYPE_CHECKING, Union +from typing import TYPE_CHECKING import torch from botorch.exceptions.errors import BotorchTensorDimensionError @@ -125,7 +125,7 @@ def rsample_from_base_samples( samples = samples.unsqueeze(-1) return samples - def rsample(self, sample_shape: Optional[torch.Size] = None) -> Tensor: + def rsample(self, sample_shape: torch.Size | None = None) -> Tensor: r"""Sample from the posterior (with gradients). Args: @@ -193,7 +193,7 @@ def scalarize_posterior_gpytorch( posterior: GPyTorchPosterior, weights: Tensor, offset: float = 0.0, -) -> tuple[Tensor, Union[Tensor, LinearOperator]]: +) -> tuple[Tensor, Tensor | LinearOperator]: r"""Helper function for `scalarize_posterior`, producing a mean and variance. 
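To ground the scalarization helpers, a hedged example assuming `posterior` is a 2-outcome `GPyTorchPosterior`:

import torch
from botorch.posteriors.gpytorch import scalarize_posterior

# Collapse the two outcomes into the affine combination 0.7 * y_0 + 0.3 * y_1.
scalarized = scalarize_posterior(posterior, weights=torch.tensor([0.7, 0.3]))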
@@ -271,7 +271,7 @@ def scalarize_posterior( - posterior: Union[GPyTorchPosterior, PosteriorList], + posterior: GPyTorchPosterior | PosteriorList, weights: Tensor, offset: float = 0.0, ) -> GPyTorchPosterior: diff --git a/botorch/posteriors/higher_order.py b/botorch/posteriors/higher_order.py index 77581c393a..d22375faa8 100644 --- a/botorch/posteriors/higher_order.py +++ b/botorch/posteriors/higher_order.py @@ -4,7 +4,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Optional import torch from botorch.exceptions.errors import BotorchTensorDimensionError @@ -162,7 +161,7 @@ def _prepare_base_samples( def rsample_from_base_samples( self, sample_shape: torch.Size, - base_samples: Optional[Tensor], + base_samples: Tensor | None, ) -> Tensor: r"""Sample from the posterior (with gradients) using base samples. @@ -244,7 +243,7 @@ def rsample_from_base_samples( def rsample( self, - sample_shape: Optional[torch.Size] = None, + sample_shape: torch.Size | None = None, ) -> Tensor: r"""Sample from the posterior (with gradients). diff --git a/botorch/posteriors/multitask.py b/botorch/posteriors/multitask.py index 76a2df43d4..c912499078 100644 --- a/botorch/posteriors/multitask.py +++ b/botorch/posteriors/multitask.py @@ -3,7 +3,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Optional, Union import torch from botorch.exceptions.errors import BotorchTensorDimensionError @@ -22,8 +21,8 @@ def __init__( self, train_diff: Tensor, test_mean: Tensor, train_train_covar: LinearOperator, - train_noise: Union[LinearOperator, Tensor], - test_noise: Optional[Union[LinearOperator, Tensor]] = None, + train_noise: LinearOperator | Tensor, + test_noise: LinearOperator | Tensor | None = None, ): r""" Posterior class for a Kronecker Multi-task GP model using an ICM kernel. @@ -184,8 +183,8 @@ def _prepare_base_samples( def rsample_from_base_samples( self, sample_shape: torch.Size, - base_samples: Optional[Tensor], - train_diff: Optional[Tensor] = None, + base_samples: Tensor | None, + train_diff: Tensor | None = None, ) -> Tensor: r"""Sample from the posterior (with gradients) using base samples. @@ -255,7 +254,7 @@ def rsample_from_base_samples( def rsample( self, - sample_shape: Optional[torch.Size] = None, + sample_shape: torch.Size | None = None, ) -> Tensor: r"""Sample from the posterior (with gradients). @@ -275,7 +274,7 @@ def rsample( ) def _draw_from_base_covar( - self, covar: Union[Tensor, LinearOperator], base_samples: Tensor + self, covar: Tensor | LinearOperator, base_samples: Tensor ) -> Tensor: # Now reparameterize those base samples if not isinstance(covar, LinearOperator): diff --git a/botorch/posteriors/posterior.py b/botorch/posteriors/posterior.py index a872d7988c..d15a829cef 100644 --- a/botorch/posteriors/posterior.py +++ b/botorch/posteriors/posterior.py @@ -11,7 +11,6 @@ from __future__ import annotations from abc import ABC, abstractmethod, abstractproperty -from typing import Optional import torch from torch import Tensor @@ -48,7 +47,7 @@ def rsample_from_base_samples( @abstractmethod def rsample( self, - sample_shape: Optional[torch.Size] = None, + sample_shape: torch.Size | None = None, ) -> Tensor: r"""Sample from the posterior (with gradients).
@@ -63,7 +62,7 @@ def rsample( """ pass # pragma: no cover - def sample(self, sample_shape: Optional[torch.Size] = None) -> Tensor: + def sample(self, sample_shape: torch.Size | None = None) -> Tensor: r"""Sample from the posterior without gradients. Args: diff --git a/botorch/posteriors/posterior_list.py b/botorch/posteriors/posterior_list.py index f49ed55d63..bc64943a71 100644 --- a/botorch/posteriors/posterior_list.py +++ b/botorch/posteriors/posterior_list.py @@ -11,7 +11,7 @@ from __future__ import annotations from functools import cached_property -from typing import Any, Optional +from typing import Any import torch from botorch.posteriors.fully_bayesian import GaussianMixturePosterior, MCMC_DIM @@ -154,7 +154,7 @@ def variance(self) -> Tensor: """ return self._reshape_and_cat(tensors=[p.variance for p in self.posteriors]) - def rsample(self, sample_shape: Optional[torch.Size] = None) -> Tensor: + def rsample(self, sample_shape: torch.Size | None = None) -> Tensor: r"""Sample from the posterior (with gradients). Args: diff --git a/botorch/posteriors/torch.py b/botorch/posteriors/torch.py index 140bcad88a..704152d219 100644 --- a/botorch/posteriors/torch.py +++ b/botorch/posteriors/torch.py @@ -10,7 +10,7 @@ from __future__ import annotations -from typing import Any, Optional +from typing import Any import torch from botorch.posteriors.posterior import Posterior @@ -42,7 +42,7 @@ def __init__(self, distribution: Distribution) -> None: def rsample( self, - sample_shape: Optional[torch.Size] = None, + sample_shape: torch.Size | None = None, ) -> Tensor: r"""Sample from the posterior (with gradients). diff --git a/botorch/posteriors/transformed.py b/botorch/posteriors/transformed.py index fa5721c720..3109827d62 100644 --- a/botorch/posteriors/transformed.py +++ b/botorch/posteriors/transformed.py @@ -6,7 +6,8 @@ from __future__ import annotations -from typing import Callable, Optional +from collections.abc import Callable + import torch from botorch.posteriors.posterior import Posterior @@ -20,8 +21,8 @@ def __init__( self, posterior: Posterior, sample_transform: Callable[[Tensor], Tensor], - mean_transform: Optional[Callable[[Tensor, Tensor], Tensor]] = None, - variance_transform: Optional[Callable[[Tensor, Tensor], Tensor]] = None, + mean_transform: Callable[[Tensor, Tensor], Tensor] | None = None, + variance_transform: Callable[[Tensor, Tensor], Tensor] | None = None, ) -> None: r"""An implicitly represented transformed posterior. @@ -126,7 +127,7 @@ def rsample_from_base_samples( def rsample( self, - sample_shape: Optional[torch.Size] = None, + sample_shape: torch.Size | None = None, ) -> Tensor: r"""Sample from the posterior (with gradients). diff --git a/botorch/sampling/base.py b/botorch/sampling/base.py index 6bf3e7fa6d..69ff46a32d 100644 --- a/botorch/sampling/base.py +++ b/botorch/sampling/base.py @@ -11,7 +11,6 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Optional import torch from botorch.exceptions.errors import InputDataError @@ -45,7 +44,7 @@ class MCSampler(Module, ABC): def __init__( self, sample_shape: torch.Size, - seed: Optional[int] = None, + seed: int | None = None, ) -> None: r"""Abstract base class for samplers. diff --git a/botorch/sampling/get_sampler.py b/botorch/sampling/get_sampler.py index 5ff4dd331e..6ebb9fdcdc 100644 --- a/botorch/sampling/get_sampler.py +++ b/botorch/sampling/get_sampler.py @@ -5,8 +5,6 @@ # LICENSE file in the root directory of this source tree. 
-from typing import Optional, Union - import torch from botorch.logging import logger from botorch.posteriors.ensemble import EnsemblePosterior @@ -31,7 +29,7 @@ def _posterior_to_distribution_encoder( posterior: Posterior, -) -> Union[type[Distribution], type[Posterior]]: +) -> type[Distribution] | type[Posterior]: r"""An encoder returning the type of the distribution for `TorchPosterior` and the type of the posterior for the rest. """ @@ -47,7 +45,7 @@ def get_sampler( posterior: TorchPosterior, sample_shape: torch.Size, *, - seed: Optional[int] = None, + seed: int | None = None, ) -> MCSampler: r"""Get the sampler for the given posterior. @@ -72,7 +70,7 @@ def _get_sampler_mvn( posterior: GPyTorchPosterior, sample_shape: torch.Size, *, - seed: Optional[int] = None, + seed: int | None = None, ) -> NormalMCSampler: r"""The Sobol normal sampler for the `MultivariateNormal` posterior. @@ -95,7 +93,7 @@ def _get_sampler_derived( posterior: TransformedPosterior, sample_shape: torch.Size, *, - seed: Optional[int] = None, + seed: int | None = None, ) -> MCSampler: r"""Get the sampler for the underlying posterior.""" return get_sampler( @@ -107,7 +105,7 @@ def _get_sampler_derived( @GetSampler.register(PosteriorList) def _get_sampler_list( - posterior: PosteriorList, sample_shape: torch.Size, *, seed: Optional[int] = None + posterior: PosteriorList, sample_shape: torch.Size, *, seed: int | None = None ) -> MCSampler: r"""Get the `ListSampler` with the appropriate list of samplers.""" samplers = [ @@ -121,7 +119,7 @@ def _get_sampler_list( def _get_sampler_ensemble( posterior: EnsemblePosterior, sample_shape: torch.Size, - seed: Optional[int] = None, + seed: int | None = None, ) -> MCSampler: r"""Get the `IndexSampler` for the `EnsemblePosterior`.""" return IndexSampler(sample_shape=sample_shape, seed=seed) @@ -131,7 +129,7 @@ def _get_sampler_ensemble( def _not_found_error( posterior: Posterior, sample_shape: torch.Size, - seed: Optional[int] = None, + seed: int | None = None, ) -> None: raise NotImplementedError( f"A registered `MCSampler` for posterior {posterior} is not found. 
You can " diff --git a/botorch/sampling/pairwise_samplers.py b/botorch/sampling/pairwise_samplers.py index 6d8b0cdeb6..e248174b24 100644 --- a/botorch/sampling/pairwise_samplers.py +++ b/botorch/sampling/pairwise_samplers.py @@ -7,7 +7,7 @@ from __future__ import annotations from itertools import combinations -from typing import Any, Optional +from typing import Any import numpy as np import torch @@ -81,7 +81,7 @@ class PairwiseIIDNormalSampler(PairwiseMCSampler, IIDNormalSampler): def __init__( self, sample_shape: torch.Size, - seed: Optional[int] = None, + seed: int | None = None, max_num_comparisons: int = None, **kwargs: Any, ) -> None: @@ -103,7 +103,7 @@ class PairwiseSobolQMCNormalSampler(PairwiseMCSampler, SobolQMCNormalSampler): def __init__( self, sample_shape: torch.Size, - seed: Optional[int] = None, + seed: int | None = None, max_num_comparisons: int = None, **kwargs: Any, ) -> None: diff --git a/botorch/sampling/pathwise/features/generators.py b/botorch/sampling/pathwise/features/generators.py index 42fd30c8d2..6cdc1ee9d6 100644 --- a/botorch/sampling/pathwise/features/generators.py +++ b/botorch/sampling/pathwise/features/generators.py @@ -16,7 +16,9 @@ from __future__ import annotations -from typing import Any, Callable +from collections.abc import Callable + +from typing import Any import torch from botorch.exceptions.errors import UnsupportedError diff --git a/botorch/sampling/pathwise/features/maps.py b/botorch/sampling/pathwise/features/maps.py index ee4df1f0ef..5d00f30312 100644 --- a/botorch/sampling/pathwise/features/maps.py +++ b/botorch/sampling/pathwise/features/maps.py @@ -6,7 +6,6 @@ from __future__ import annotations -from typing import Optional, Union import torch from botorch.sampling.pathwise.utils import ( @@ -23,8 +22,8 @@ class FeatureMap(TransformedModuleMixin, Module): num_outputs: int batch_shape: Size - input_transform: Optional[TInputTransform] - output_transform: Optional[TOutputTransform] + input_transform: TInputTransform | None + output_transform: TOutputTransform | None class KernelEvaluationMap(FeatureMap): @@ -34,8 +33,8 @@ def __init__( self, kernel: Kernel, points: Tensor, - input_transform: Optional[TInputTransform] = None, - output_transform: Optional[TOutputTransform] = None, + input_transform: TInputTransform | None = None, + output_transform: TOutputTransform | None = None, ) -> None: r"""Initializes a KernelEvaluationMap instance: @@ -62,7 +61,7 @@ def __init__( self.input_transform = input_transform self.output_transform = output_transform - def forward(self, x: Tensor) -> Union[Tensor, LinearOperator]: + def forward(self, x: Tensor) -> Tensor | LinearOperator: return self.kernel(x, self.points) @property @@ -90,9 +89,9 @@ def __init__( self, kernel: Kernel, weight: Tensor, - bias: Optional[Tensor] = None, - input_transform: Optional[TInputTransform] = None, - output_transform: Optional[TOutputTransform] = None, + bias: Tensor | None = None, + input_transform: TInputTransform | None = None, + output_transform: TOutputTransform | None = None, ) -> None: r"""Initializes a KernelFeatureMap instance: diff --git a/botorch/sampling/pathwise/paths.py b/botorch/sampling/pathwise/paths.py index de840c4e1d..0b64792502 100644 --- a/botorch/sampling/pathwise/paths.py +++ b/botorch/sampling/pathwise/paths.py @@ -7,8 +7,8 @@ from __future__ import annotations from abc import ABC -from collections.abc import Iterable, Iterator, Mapping -from typing import Any, Callable, Optional, Union +from collections.abc import Callable, Iterable, Iterator, 
Mapping +from typing import Any from botorch.exceptions.errors import UnsupportedError from botorch.sampling.pathwise.features import FeatureMap @@ -30,10 +30,10 @@ class PathDict(SamplePath): def __init__( self, - paths: Optional[Mapping[str, SamplePath]] = None, - join: Optional[Callable[[list[Tensor]], Tensor]] = None, - input_transform: Optional[TInputTransform] = None, - output_transform: Optional[TOutputTransform] = None, + paths: Mapping[str, SamplePath] | None = None, + join: Callable[[list[Tensor]], Tensor] | None = None, + input_transform: TInputTransform | None = None, + output_transform: TOutputTransform | None = None, ) -> None: r"""Initializes a PathDict instance. @@ -56,7 +56,7 @@ def __init__( else ModuleDict({} if paths is None else paths) ) - def forward(self, x: Tensor, **kwargs: Any) -> Union[Tensor, dict[str, Tensor]]: + def forward(self, x: Tensor, **kwargs: Any) -> Tensor | dict[str, Tensor]: out = [path(x, **kwargs) for path in self.paths.values()] return dict(zip(self.paths, out)) if self.join is None else self.join(out) @@ -90,10 +90,10 @@ class PathList(SamplePath): def __init__( self, - paths: Optional[Iterable[SamplePath]] = None, - join: Optional[Callable[[list[Tensor]], Tensor]] = None, - input_transform: Optional[TInputTransform] = None, - output_transform: Optional[TOutputTransform] = None, + paths: Iterable[SamplePath] | None = None, + join: Callable[[list[Tensor]], Tensor] | None = None, + input_transform: TInputTransform | None = None, + output_transform: TOutputTransform | None = None, ) -> None: r"""Initializes a PathList instance. @@ -117,7 +117,7 @@ def __init__( else ModuleList({} if paths is None else paths) ) - def forward(self, x: Tensor, **kwargs: Any) -> Union[Tensor, list[Tensor]]: + def forward(self, x: Tensor, **kwargs: Any) -> Tensor | list[Tensor]: out = [path(x, **kwargs) for path in self.paths] return out if self.join is None else self.join(out) @@ -143,10 +143,10 @@ class GeneralizedLinearPath(SamplePath): def __init__( self, feature_map: FeatureMap, - weight: Union[Parameter, Tensor], - bias_module: Optional[Module] = None, - input_transform: Optional[TInputTransform] = None, - output_transform: Optional[TOutputTransform] = None, + weight: Parameter | Tensor, + bias_module: Module | None = None, + input_transform: TInputTransform | None = None, + output_transform: TOutputTransform | None = None, ): r"""Initializes a GeneralizedLinearPath instance. diff --git a/botorch/sampling/pathwise/posterior_samplers.py b/botorch/sampling/pathwise/posterior_samplers.py index 92b104123f..82d78fdd8c 100644 --- a/botorch/sampling/pathwise/posterior_samplers.py +++ b/botorch/sampling/pathwise/posterior_samplers.py @@ -17,7 +17,6 @@ from __future__ import annotations -from typing import Optional, Union import torch from botorch.exceptions.errors import UnsupportedError @@ -68,8 +67,8 @@ def __init__( self, prior_paths: SamplePath, update_paths: SamplePath, - input_transform: Optional[TInputTransform] = None, - output_transform: Optional[TOutputTransform] = None, + input_transform: TInputTransform | None = None, + output_transform: TOutputTransform | None = None, ) -> None: r"""Initializes a MatheronPath instance. @@ -89,7 +88,7 @@ def __init__( def get_matheron_path_model( - model: GP, sample_shape: Optional[Size] = None + model: GP, sample_shape: Size | None = None ) -> GenericDeterministicModel: r"""Generates a deterministic model using a single Matheron path drawn from the model's posterior. 
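The `get_matheron_path_model` helper above now advertises `sample_shape: Size | None = None`. A minimal usage sketch, assuming a fitted `SingleTaskGP` on toy data (the data and model here are illustrative, not part of the diff):

import torch
from botorch.models import SingleTaskGP
from botorch.sampling.pathwise.posterior_samplers import get_matheron_path_model

# Illustrative toy data; any fitted GP model would do.
train_X = torch.rand(10, 2, dtype=torch.float64)
train_Y = train_X.sum(dim=-1, keepdim=True)
model = SingleTaskGP(train_X, train_Y)

# Omitting `sample_shape` exercises the new `Size | None = None` default:
# a single Matheron path, wrapped as a deterministic model.
path_model = get_matheron_path_model(model=model)
test_X = torch.rand(5, 2, dtype=torch.float64)
values = path_model.posterior(test_X).mean  # pointwise path evaluations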
@@ -226,7 +225,7 @@ def _draw_matheron_paths_ExactGP( @DrawMatheronPaths.register((ApproximateGP, ApproximateGPyTorchModel)) def _draw_matheron_paths_ApproximateGP( - model: Union[ApproximateGP, ApproximateGPyTorchModel], + model: ApproximateGP | ApproximateGPyTorchModel, *, sample_shape: Size, prior_sampler: TPathwisePriorSampler, diff --git a/botorch/sampling/pathwise/prior_samplers.py b/botorch/sampling/pathwise/prior_samplers.py index 32bced7b27..4fba5b49f6 100644 --- a/botorch/sampling/pathwise/prior_samplers.py +++ b/botorch/sampling/pathwise/prior_samplers.py @@ -6,7 +6,9 @@ from __future__ import annotations -from typing import Any, Callable, Optional +from collections.abc import Callable + +from typing import Any from botorch.models.approximate_gp import ApproximateGPyTorchModel from botorch.models.model_list_gp_regression import ModelListGP @@ -51,14 +53,14 @@ def draw_kernel_feature_paths( def _draw_kernel_feature_paths_fallback( num_inputs: int, - mean_module: Optional[Module], + mean_module: Module | None, covar_module: Kernel, sample_shape: Size, num_features: int = 1024, map_generator: TKernelFeatureMapGenerator = gen_kernel_features, - input_transform: Optional[TInputTransform] = None, - output_transform: Optional[TOutputTransform] = None, - weight_generator: Optional[Callable[[Size], Tensor]] = None, + input_transform: TInputTransform | None = None, + output_transform: TOutputTransform | None = None, + weight_generator: Callable[[Size], Tensor] | None = None, ) -> GeneralizedLinearPath: # Generate a kernel feature map @@ -109,7 +111,7 @@ def _draw_kernel_feature_paths_ExactGP( @DrawKernelFeaturePaths.register(ModelListGP) def _draw_kernel_feature_paths_list( model: ModelListGP, - join: Optional[Callable[[list[Tensor]], Tensor]] = None, + join: Callable[[list[Tensor]], Tensor] | None = None, **kwargs: Any, ) -> PathList: paths = [draw_kernel_feature_paths(m, **kwargs) for m in model.models] diff --git a/botorch/sampling/pathwise/update_strategies.py b/botorch/sampling/pathwise/update_strategies.py index 7b8ca8ca3f..7d92e04a1a 100644 --- a/botorch/sampling/pathwise/update_strategies.py +++ b/botorch/sampling/pathwise/update_strategies.py @@ -6,9 +6,11 @@ from __future__ import annotations +from collections.abc import Callable + from types import NoneType -from typing import Any, Callable, Optional, Union +from typing import Any import torch from botorch.models.approximate_gp import ApproximateGPyTorchModel @@ -41,7 +43,7 @@ def gaussian_update( model: GP, sample_values: Tensor, - likelihood: Optional[Likelihood] = DEFAULT, + likelihood: Likelihood | None = DEFAULT, **kwargs: Any, ) -> GeneralizedLinearPath: r"""Computes a Gaussian pathwise update in exact arithmetic: @@ -74,9 +76,9 @@ def _gaussian_update_exact( points: Tensor, target_values: Tensor, sample_values: Tensor, - noise_covariance: Optional[Union[Tensor, LinearOperator]] = None, - scale_tril: Optional[Union[Tensor, LinearOperator]] = None, - input_transform: Optional[TInputTransform] = None, + noise_covariance: Tensor | LinearOperator | None = None, + scale_tril: Tensor | LinearOperator | None = None, + input_transform: TInputTransform | None = None, ) -> GeneralizedLinearPath: # Prepare Cholesky factor of `Cov(y, y)` and noise sample values as needed if isinstance(noise_covariance, (NoneType, ZeroLinearOperator)): @@ -110,10 +112,10 @@ def _gaussian_update_ExactGP( likelihood: _GaussianLikelihoodBase, *, sample_values: Tensor, - target_values: Optional[Tensor] = None, - points: Optional[Tensor] = None, - 
noise_covariance: Optional[Union[Tensor, LinearOperator]] = None, - scale_tril: Optional[Union[Tensor, LinearOperator]] = None, + target_values: Tensor | None = None, + points: Tensor | None = None, + noise_covariance: Tensor | LinearOperator | None = None, + scale_tril: Tensor | LinearOperator | None = None, ) -> GeneralizedLinearPath: if points is None: (points,) = get_train_inputs(model, transformed=True) @@ -138,7 +140,7 @@ def _gaussian_update_ExactGP( @GaussianUpdate.register(ApproximateGPyTorchModel, (Likelihood, NoneType)) def _gaussian_update_ApproximateGPyTorchModel( model: ApproximateGPyTorchModel, - likelihood: Optional[Likelihood], + likelihood: Likelihood | None, **kwargs: Any, ) -> GeneralizedLinearPath: return GaussianUpdate( @@ -148,7 +150,7 @@ def _gaussian_update_ApproximateGPyTorchModel( @GaussianUpdate.register(ApproximateGP, (Likelihood, NoneType)) def _gaussian_update_ApproximateGP( - model: ApproximateGP, likelihood: Optional[Likelihood], **kwargs: Any + model: ApproximateGP, likelihood: Likelihood | None, **kwargs: Any ) -> GeneralizedLinearPath: return GaussianUpdate(model, model.variational_strategy, **kwargs) @@ -159,9 +161,9 @@ def _gaussian_update_ApproximateGP_VariationalStrategy( _: VariationalStrategy, *, sample_values: Tensor, - target_values: Optional[Tensor] = None, - noise_covariance: Optional[Union[Tensor, LinearOperator]] = None, - input_transform: Optional[InputTransform] = None, + target_values: Tensor | None = None, + noise_covariance: Tensor | LinearOperator | None = None, + input_transform: InputTransform | None = None, **ignore: Any, ) -> GeneralizedLinearPath: # TODO: Account for jitter added by `psd_safe_cholesky` diff --git a/botorch/sampling/pathwise/utils.py b/botorch/sampling/pathwise/utils.py index c4ab9ab261..5935fa6f69 100644 --- a/botorch/sampling/pathwise/utils.py +++ b/botorch/sampling/pathwise/utils.py @@ -7,8 +7,8 @@ from __future__ import annotations from abc import ABC, abstractmethod -from collections.abc import Iterable -from typing import Any, Callable, Optional, overload, Union +from collections.abc import Callable, Iterable +from typing import Any, overload, Union import torch from botorch.models.approximate_gp import SingleTaskVariationalGP @@ -31,8 +31,8 @@ class TransformedModuleMixin: r"""Mixin that wraps a module's __call__ method with optional transforms.""" - input_transform: Optional[TInputTransform] - output_transform: Optional[TOutputTransform] + input_transform: TInputTransform | None + output_transform: TOutputTransform | None def __call__(self, values: Tensor, *args: Any, **kwargs: Any) -> Tensor: input_transform = getattr(self, "input_transform", None) @@ -84,7 +84,7 @@ def forward(self, values: Tensor) -> Tensor: class SineCosineTransform(TensorTransform): r"""A transform that returns concatenated sine and cosine features.""" - def __init__(self, scale: Optional[Tensor] = None): + def __init__(self, scale: Tensor | None = None): r"""Initializes a SineCosineTransform instance. Args: @@ -143,7 +143,7 @@ class FeatureSelector(TensorTransform): r"""A transform that returns a subset of its input's features. along a given tensor dimension.""" - def __init__(self, indices: Iterable[int], dim: Union[int, LongTensor] = -1): + def __init__(self, indices: Iterable[int], dim: int | LongTensor = -1): r"""Initializes a FeatureSelector instance. 
Args: @@ -166,7 +166,7 @@ class OutcomeUntransformer(TensorTransform): def __init__( self, transform: OutcomeTransform, - num_outputs: Union[int, LongTensor], + num_outputs: int | LongTensor, ): r"""Initializes an OutcomeUntransformer instance. @@ -193,12 +193,12 @@ def forward(self, values: Tensor) -> Tensor: return output_values.transpose(-2, -1) -def get_input_transform(model: GPyTorchModel) -> Optional[InputTransform]: +def get_input_transform(model: GPyTorchModel) -> InputTransform | None: r"""Returns a model's input_transform or None.""" return getattr(model, "input_transform", None) -def get_output_transform(model: GPyTorchModel) -> Optional[OutcomeUntransformer]: +def get_output_transform(model: GPyTorchModel) -> OutcomeUntransformer | None: r"""Returns a wrapped version of a model's outcome_transform or None.""" transform = getattr(model, "outcome_transform", None) if transform is None: diff --git a/botorch/sampling/qmc.py b/botorch/sampling/qmc.py index c0fb6858e2..e506a60c89 100644 --- a/botorch/sampling/qmc.py +++ b/botorch/sampling/qmc.py @@ -17,7 +17,6 @@ from __future__ import annotations import math -from typing import Optional import torch from torch import Tensor @@ -37,7 +36,7 @@ class NormalQMCEngine: """ def __init__( - self, d: int, seed: Optional[int] = None, inv_transform: bool = False + self, d: int, seed: int | None = None, inv_transform: bool = False ) -> None: r"""Engine for drawing qMC samples from a multivariate normal `N(0, I_d)`. @@ -60,9 +59,9 @@ def __init__( def draw( self, n: int = 1, - out: Optional[Tensor] = None, - dtype: Optional[torch.dtype] = None, - ) -> Optional[Tensor]: + out: Tensor | None = None, + dtype: torch.dtype | None = None, + ) -> Tensor | None: r"""Draw `n` qMC samples from the standard Normal. Args: @@ -116,7 +115,7 @@ def __init__( self, mean: Tensor, cov: Tensor, - seed: Optional[int] = None, + seed: int | None = None, inv_transform: bool = False, ) -> None: r"""Engine for qMC sampling from a multivariate Normal `N(\mu, \Sigma)`. @@ -150,7 +149,7 @@ def __init__( eigval_root = eigval.clamp_min(0.0).sqrt() self._corr_matrix = (eigvec * eigval_root).transpose(-1, -2) - def draw(self, n: int = 1, out: Optional[Tensor] = None) -> Optional[Tensor]: + def draw(self, n: int = 1, out: Tensor | None = None) -> Tensor | None: r"""Draw `n` qMC samples from the multivariate Normal. Args: diff --git a/botorch/test_functions/base.py b/botorch/test_functions/base.py index b0dbf37f76..d65343315f 100644 --- a/botorch/test_functions/base.py +++ b/botorch/test_functions/base.py @@ -11,7 +11,6 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Optional, Union import torch from botorch.exceptions.errors import InputDataError @@ -28,7 +27,7 @@ class BaseTestProblem(Module, ABC): def __init__( self, - noise_std: Union[None, float, list[float]] = None, + noise_std: None | float | list[float] = None, negate: bool = False, ) -> None: r"""Base constructor for test functions. @@ -96,7 +95,7 @@ class ConstrainedBaseTestProblem(BaseTestProblem, ABC): num_constraints: int _check_grad_at_opt: bool = False - constraint_noise_std: Union[None, float, list[float]] = None + constraint_noise_std: None | float | list[float] = None def evaluate_slack(self, X: Tensor, noise: bool = True) -> Tensor: r"""Evaluate the constraint slack on a set of points. 
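The `NormalQMCEngine` changes above are annotation-only (`seed: int | None`, `out: Tensor | None`, `dtype: torch.dtype | None`); behavior is unchanged. A brief sketch of the touched signatures, with illustrative argument values:

import torch
from botorch.sampling.qmc import NormalQMCEngine

engine = NormalQMCEngine(d=2, seed=1234)       # seed: int | None = None
samples = engine.draw(8, dtype=torch.float64)  # 8 x 2 qMC draws from N(0, I_2)
unseeded = NormalQMCEngine(d=2)                # seed omitted, i.e. None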
@@ -161,11 +160,11 @@ class MultiObjectiveTestProblem(BaseTestProblem, ABC): num_objectives: int _ref_point: list[float] - _max_hv: Optional[float] = None + _max_hv: float | None = None def __init__( self, - noise_std: Union[None, float, list[float]] = None, + noise_std: None | float | list[float] = None, negate: bool = False, ) -> None: r"""Base constructor for multi-objective test functions. diff --git a/botorch/test_functions/multi_fidelity.py b/botorch/test_functions/multi_fidelity.py index dd2d057f0b..c7855c22b7 100644 --- a/botorch/test_functions/multi_fidelity.py +++ b/botorch/test_functions/multi_fidelity.py @@ -11,7 +11,6 @@ from __future__ import annotations import math -from typing import Optional import torch from botorch.test_functions.synthetic import SyntheticTestFunction @@ -75,7 +74,7 @@ class AugmentedHartmann(SyntheticTestFunction): _optimizers = [(0.20169, 0.150011, 0.476874, 0.275332, 0.311652, 0.6573, 1.0)] _check_grad_at_opt = False - def __init__(self, noise_std: Optional[float] = None, negate: bool = False) -> None: + def __init__(self, noise_std: float | None = None, negate: bool = False) -> None: r""" Args: noise_std: Standard deviation of the observation noise. @@ -127,7 +126,7 @@ class AugmentedRosenbrock(SyntheticTestFunction): _optimal_value = 0.0 def __init__( - self, dim=3, noise_std: Optional[float] = None, negate: bool = False + self, dim=3, noise_std: float | None = None, negate: bool = False ) -> None: r""" Args: diff --git a/botorch/test_functions/multi_objective.py b/botorch/test_functions/multi_objective.py index 19b1140c97..1a21b138d4 100644 --- a/botorch/test_functions/multi_objective.py +++ b/botorch/test_functions/multi_objective.py @@ -76,7 +76,6 @@ import math from abc import ABC, abstractmethod from math import pi -from typing import Union import torch from botorch.exceptions.errors import UnsupportedError @@ -118,7 +117,7 @@ class BraninCurrin(MultiObjectiveTestProblem): def __init__( self, - noise_std: Union[None, float, list[float]] = None, + noise_std: None | float | list[float] = None, negate: bool = False, ) -> None: r""" @@ -178,7 +177,7 @@ class DH(MultiObjectiveTestProblem, ABC): def __init__( self, dim: int, - noise_std: Union[None, float, list[float]] = None, + noise_std: None | float | list[float] = None, negate: bool = False, ) -> None: r""" @@ -338,7 +337,7 @@ def __init__( self, dim: int, num_objectives: int = 2, - noise_std: Union[None, float, list[float]] = None, + noise_std: None | float | list[float] = None, negate: bool = False, ) -> None: r""" @@ -606,7 +605,7 @@ class GMM(MultiObjectiveTestProblem): def __init__( self, - noise_std: Union[None, float, list[float]] = None, + noise_std: None | float | list[float] = None, negate: bool = False, num_objectives: int = 2, ) -> None: @@ -934,7 +933,7 @@ def __init__( self, dim: int, num_objectives: int = 2, - noise_std: Union[None, float, list[float]] = None, + noise_std: None | float | list[float] = None, negate: bool = False, ) -> None: r""" @@ -1244,8 +1243,8 @@ class ConstrainedBraninCurrin(BraninCurrin, ConstrainedBaseTestProblem): def __init__( self, - noise_std: Union[None, float, list[float]] = None, - constraint_noise_std: Union[None, float, list[float]] = None, + noise_std: None | float | list[float] = None, + constraint_noise_std: None | float | list[float] = None, negate: bool = False, ) -> None: r""" @@ -1356,8 +1355,8 @@ class MW7(MultiObjectiveTestProblem, ConstrainedBaseTestProblem): def __init__( self, dim: int, - noise_std: Union[None, float, list[float]] = None, 
- constraint_noise_std: Union[None, float, list[float]] = None, + noise_std: None | float | list[float] = None, + constraint_noise_std: None | float | list[float] = None, negate: bool = False, ) -> None: r""" diff --git a/botorch/test_functions/sensitivity_analysis.py b/botorch/test_functions/sensitivity_analysis.py index 4c8a69f0db..51eae546ae 100644 --- a/botorch/test_functions/sensitivity_analysis.py +++ b/botorch/test_functions/sensitivity_analysis.py @@ -4,7 +4,6 @@ # LICENSE file in the root directory of this source tree. import math -from typing import Optional import torch @@ -25,7 +24,7 @@ class Ishigami(SyntheticTestFunction): """ def __init__( - self, b: float = 0.1, noise_std: Optional[float] = None, negate: bool = False + self, b: float = 0.1, noise_std: float | None = None, negate: bool = False ) -> None: r""" Args: @@ -126,7 +125,7 @@ def __init__( self, dim: int, a: list = None, - noise_std: Optional[float] = None, + noise_std: float | None = None, negate: bool = False, ) -> None: r""" @@ -175,7 +174,7 @@ def optimal_sobol_indicies(self): for i in range(self.dim): vi.append(1 / (3 * ((1 + self.a[i]) ** 2))) self.vi = Tensor(vi) - self.V = torch.prod((1 + self.vi)) - 1 + self.V = torch.prod(1 + self.vi) - 1 self.si = self.vi / self.V si_t = [] for i in range(self.dim): @@ -208,7 +207,7 @@ class Morris(SyntheticTestFunction): Proposed to test sensitivity analysis methods """ - def __init__(self, noise_std: Optional[float] = None, negate: bool = False) -> None: + def __init__(self, noise_std: float | None = None, negate: bool = False) -> None: r""" Args: noise_std: Standard deviation of observation noise. diff --git a/botorch/test_functions/synthetic.py b/botorch/test_functions/synthetic.py index 69630a5c08..2bea968ba5 100644 --- a/botorch/test_functions/synthetic.py +++ b/botorch/test_functions/synthetic.py @@ -48,7 +48,6 @@ import math from abc import ABC -from typing import Optional, Union import torch from botorch.exceptions.errors import InputDataError @@ -60,15 +59,15 @@ class SyntheticTestFunction(BaseTestProblem, ABC): r"""Base class for synthetic test functions.""" - _optimal_value: Optional[float] = None - _optimizers: Optional[list[tuple[float, ...]]] = None + _optimal_value: float | None = None + _optimizers: list[tuple[float, ...]] | None = None num_objectives: int = 1 def __init__( self, - noise_std: Union[None, float, list[float]] = None, + noise_std: None | float | list[float] = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -136,9 +135,9 @@ class Ackley(SyntheticTestFunction): def __init__( self, dim: int = 2, - noise_std: Optional[float] = None, + noise_std: float | None = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -263,9 +262,9 @@ class DixonPrice(SyntheticTestFunction): def __init__( self, dim=2, - noise_std: Optional[float] = None, + noise_std: float | None = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -332,9 +331,9 @@ class Griewank(SyntheticTestFunction): def __init__( self, dim=2, - noise_std: Optional[float] = None, + noise_std: float | None = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -374,9 +373,9 @@ 
class Hartmann(SyntheticTestFunction): def __init__( self, dim=6, - noise_std: Optional[float] = None, + noise_std: float | None = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -508,9 +507,9 @@ class Levy(SyntheticTestFunction): def __init__( self, dim=2, - noise_std: Optional[float] = None, + noise_std: float | None = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -550,9 +549,9 @@ class Michalewicz(SyntheticTestFunction): def __init__( self, dim=2, - noise_std: Optional[float] = None, + noise_std: float | None = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -610,9 +609,9 @@ class Powell(SyntheticTestFunction): def __init__( self, dim=4, - noise_std: Optional[float] = None, + noise_std: float | None = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -646,9 +645,9 @@ class Rastrigin(SyntheticTestFunction): def __init__( self, dim=2, - noise_std: Optional[float] = None, + noise_std: float | None = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -685,9 +684,9 @@ class Rosenbrock(SyntheticTestFunction): def __init__( self, dim=2, - noise_std: Optional[float] = None, + noise_std: float | None = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -727,9 +726,9 @@ class Shekel(SyntheticTestFunction): def __init__( self, m: int = 10, - noise_std: Optional[float] = None, + noise_std: float | None = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -793,9 +792,9 @@ class StyblinskiTang(SyntheticTestFunction): def __init__( self, dim=2, - noise_std: Optional[float] = None, + noise_std: float | None = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -839,10 +838,10 @@ class ConstrainedSyntheticTestFunction( def __init__( self, - noise_std: Union[None, float, list[float]] = None, - constraint_noise_std: Union[None, float, list[float]] = None, + noise_std: None | float | list[float] = None, + constraint_noise_std: None | float | list[float] = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -864,7 +863,7 @@ def __init__( def _validate_constraint_noise( self, constraint_noise_std - ) -> Union[None, float, list[float]]: + ) -> None | float | list[float]: """ Validates that constraint_noise_std has length equal to the number of constraints, if given as a list @@ -931,10 +930,10 @@ class ConstrainedHartmann(Hartmann, ConstrainedSyntheticTestFunction): def __init__( self, dim: int = 6, - noise_std: Union[None, float] = None, - constraint_noise_std: Union[None, float, list[float]] = None, + noise_std: None | float = None, + constraint_noise_std: None | float | list[float] = None, negate: bool = False, - bounds: Optional[list[tuple[float, 
float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: @@ -969,10 +968,10 @@ class ConstrainedHartmannSmooth(Hartmann, ConstrainedSyntheticTestFunction): def __init__( self, dim: int = 6, - noise_std: Union[None, float] = None, - constraint_noise_std: Union[None, float, list[float]] = None, + noise_std: None | float = None, + constraint_noise_std: None | float | list[float] = None, negate: bool = False, - bounds: Optional[list[tuple[float, float]]] = None, + bounds: list[tuple[float, float]] | None = None, ) -> None: r""" Args: diff --git a/botorch/test_functions/utils.py b/botorch/test_functions/utils.py index 0726e96512..94b538b224 100644 --- a/botorch/test_functions/utils.py +++ b/botorch/test_functions/utils.py @@ -7,7 +7,6 @@ from __future__ import annotations -from typing import Optional import torch @@ -15,7 +14,7 @@ def round_nearest( - X: Tensor, increment: float, bounds: Optional[tuple[float, float]] + X: Tensor, increment: float, bounds: tuple[float, float] | None ) -> Tensor: r"""Rounds the input tensor to the nearest multiple of `increment`. diff --git a/botorch/utils/constants.py b/botorch/utils/constants.py index 2488828dcd..eb09e6ce4c 100644 --- a/botorch/utils/constants.py +++ b/botorch/utils/constants.py @@ -10,7 +10,6 @@ from functools import lru_cache from numbers import Number -from typing import Optional, Union import torch from torch import Tensor @@ -18,10 +17,10 @@ @lru_cache(maxsize=None) def get_constants( - values: Union[Number, Iterator[Number]], - device: Optional[torch.device] = None, - dtype: Optional[torch.dtype] = None, -) -> Union[Tensor, tuple[Tensor, ...]]: + values: Number | Iterator[Number], + device: torch.device | None = None, + dtype: torch.dtype | None = None, +) -> Tensor | tuple[Tensor, ...]: r"""Returns scalar-valued Tensors containing each of the given constants. Used to expedite tensor operations involving scalar arithmetic. Note that the returned Tensors should not be modified in-place.""" @@ -32,7 +31,7 @@ def get_constants( def get_constants_like( - values: Union[Number, Iterator[Number]], + values: Number | Iterator[Number], ref: Tensor, -) -> Union[Tensor, Iterator[Tensor]]: +) -> Tensor | Iterator[Tensor]: return get_constants(values, device=ref.device, dtype=ref.dtype) diff --git a/botorch/utils/constraints.py b/botorch/utils/constraints.py index 84d772ea2d..003c9bc91a 100644 --- a/botorch/utils/constraints.py +++ b/botorch/utils/constraints.py @@ -10,16 +10,17 @@ from __future__ import annotations +from collections.abc import Callable + from functools import partial -from typing import Callable, Optional import torch from torch import Tensor def get_outcome_constraint_transforms( - outcome_constraints: Optional[tuple[Tensor, Tensor]] -) -> Optional[list[Callable[[Tensor], Tensor]]]: + outcome_constraints: tuple[Tensor, Tensor] | None +) -> list[Callable[[Tensor], Tensor]] | None: r"""Create outcome constraint callables from outcome constraint tensors. Args: @@ -66,8 +67,8 @@ def _oc(a: Tensor, rhs: Tensor, Y: Tensor) -> Tensor: def get_monotonicity_constraints( d: int, descending: bool = False, - dtype: Optional[torch.dtype] = None, - device: Optional[torch.device] = None, + dtype: torch.dtype | None = None, + device: torch.device | None = None, ) -> tuple[Tensor, Tensor]: """Returns a system of linear inequalities `(A, b)` that generically encodes order constraints on the elements of a `d`-dimensional space, i.e.
`A @ x < b` implies diff --git a/botorch/utils/context_managers.py b/botorch/utils/context_managers.py index e9d2e9f76e..d399658258 100644 --- a/botorch/utils/context_managers.py +++ b/botorch/utils/context_managers.py @@ -10,10 +10,10 @@ from __future__ import annotations -from collections.abc import Generator, Iterable +from collections.abc import Callable, Generator, Iterable from contextlib import contextmanager -from typing import Any, Callable, NamedTuple, Optional, Union +from typing import Any, NamedTuple from torch import device as Device, dtype as Dtype, Tensor from torch.nn import Module @@ -21,8 +21,8 @@ class TensorCheckpoint(NamedTuple): values: Tensor - device: Optional[Device] = None - dtype: Optional[Dtype] = None + device: Device | None = None + dtype: Dtype | None = None @contextmanager @@ -49,7 +49,7 @@ def delattr_ctx( @contextmanager def parameter_rollback_ctx( parameters: dict[str, Tensor], - checkpoint: Optional[dict[str, TensorCheckpoint]] = None, + checkpoint: dict[str, TensorCheckpoint] | None = None, **tkwargs: Any, ) -> Generator[dict[str, TensorCheckpoint], None, None]: r"""Contextmanager that exits by rolling back a module's state_dict. @@ -92,8 +92,8 @@ def parameter_rollback_ctx( @contextmanager def module_rollback_ctx( module: Module, - name_filter: Optional[Callable[[str], bool]] = None, - checkpoint: Optional[dict[str, TensorCheckpoint]] = None, + name_filter: Callable[[str], bool] | None = None, + checkpoint: dict[str, TensorCheckpoint] | None = None, **tkwargs: Any, ) -> Generator[dict[str, TensorCheckpoint], None, None]: r"""Contextmanager that exits by rolling back a module's state_dict. @@ -141,7 +141,7 @@ def module_rollback_ctx( @contextmanager def zero_grad_ctx( - parameters: Union[dict[str, Tensor], Iterable[Tensor]], + parameters: dict[str, Tensor] | Iterable[Tensor], zero_on_enter: bool = True, zero_on_exit: bool = False, ) -> Generator[None, None, None]: diff --git a/botorch/utils/datasets.py b/botorch/utils/datasets.py index ac6337596c..7afa0c9ca4 100644 --- a/botorch/utils/datasets.py +++ b/botorch/utils/datasets.py @@ -9,7 +9,7 @@ from __future__ import annotations import warnings -from typing import Any, Optional, Union +from typing import Any import torch from botorch.exceptions.errors import InputDataError, UnsupportedError @@ -47,12 +47,12 @@ class SupervisedDataset: def __init__( self, - X: Union[BotorchContainer, Tensor], - Y: Union[BotorchContainer, Tensor], + X: BotorchContainer | Tensor, + Y: BotorchContainer | Tensor, *, feature_names: list[str], outcome_names: list[str], - Yvar: Union[BotorchContainer, Tensor, None] = None, + Yvar: BotorchContainer | Tensor | None = None, validate_init: bool = True, ) -> None: r"""Constructs a `SupervisedDataset`. 
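Under the updated `BotorchContainer | Tensor` hints, constructing a `SupervisedDataset` is unchanged; a small sketch with plain tensors (shapes and names are illustrative):

import torch
from botorch.utils.datasets import SupervisedDataset

X = torch.rand(8, 2)
Y = torch.rand(8, 1)
dataset = SupervisedDataset(
    X=X,                          # BotorchContainer | Tensor
    Y=Y,                          # BotorchContainer | Tensor
    feature_names=["x1", "x2"],
    outcome_names=["y"],
    Yvar=None,                    # BotorchContainer | Tensor | None
)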
@@ -87,7 +87,7 @@ def Y(self) -> Tensor: return self._Y() @property - def Yvar(self) -> Optional[Tensor]: + def Yvar(self) -> Tensor | None: if self._Yvar is None or isinstance(self._Yvar, Tensor): return self._Yvar return self._Yvar() @@ -159,9 +159,9 @@ class FixedNoiseDataset(SupervisedDataset): def __init__( self, - X: Union[BotorchContainer, Tensor], - Y: Union[BotorchContainer, Tensor], - Yvar: Union[BotorchContainer, Tensor], + X: BotorchContainer | Tensor, + Y: BotorchContainer | Tensor, + Yvar: BotorchContainer | Tensor, feature_names: list[str], outcome_names: list[str], validate_init: bool = True, @@ -217,7 +217,7 @@ class RankingDataset(SupervisedDataset): def __init__( self, X: SliceContainer, - Y: Union[BotorchContainer, Tensor], + Y: BotorchContainer | Tensor, feature_names: list[str], outcome_names: list[str], validate_init: bool = True, @@ -289,7 +289,7 @@ def __init__( self, datasets: list[SupervisedDataset], target_outcome_name: str, - task_feature_index: Optional[int] = None, + task_feature_index: int | None = None, ): """Construct a `MultiTaskDataset`. @@ -323,7 +323,7 @@ def from_joint_dataset( dataset: SupervisedDataset, task_feature_index: int, target_task_value: int, - outcome_names_per_task: Optional[dict[int, str]] = None, + outcome_names_per_task: dict[int, str] | None = None, ) -> MultiTaskDataset: r"""Construct a `MultiTaskDataset` from a joint dataset that includes the data for all tasks with the task feature index. @@ -461,7 +461,7 @@ def Y(self) -> Tensor: return torch.cat([ds.Y for ds in self.datasets.values()], dim=0) @property - def Yvar(self) -> Optional[Tensor]: + def Yvar(self) -> Tensor | None: """Concatenates Yvars of the datasets if they exist.""" all_Yvars = [ds.Yvar for ds in self.datasets.values()] return None if all_Yvars[0] is None else torch.cat(all_Yvars, dim=0) @@ -503,7 +503,7 @@ def __init__( self, datasets: list[SupervisedDataset], parameter_decomposition: dict[str, list[str]], - metric_decomposition: Optional[dict[str, list[str]]] = None, + metric_decomposition: dict[str, list[str]] | None = None, ): """Construct a `ContextualDataset`. diff --git a/botorch/utils/dispatcher.py b/botorch/utils/dispatcher.py index ee372b8d2f..5995c54ba7 100644 --- a/botorch/utils/dispatcher.py +++ b/botorch/utils/dispatcher.py @@ -6,8 +6,10 @@ from __future__ import annotations +from collections.abc import Callable + from inspect import getsource, getsourcefile -from typing import Any, Callable, Optional +from typing import Any from multipledispatch.dispatcher import ( Dispatcher as MDDispatcher, @@ -31,7 +33,7 @@ class Dispatcher(MDDispatcher): def __init__( self, name: str, - doc: Optional[str] = None, + doc: str | None = None, encoder: Callable[Any, type] = type, ) -> None: """ @@ -47,8 +49,8 @@ def __init__( def __getitem__( self, - args: Optional[Any] = None, - types: Optional[tuple[type]] = None, + args: Any | None = None, + types: tuple[type] | None = None, ) -> Callable: r"""Method lookup. 
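For reference, the `Dispatcher` touched above dispatches on the encoded types of its arguments (the default encoder is `type`); a minimal sketch with hypothetical handlers:

from botorch.utils.dispatcher import Dispatcher

demo = Dispatcher(name="demo")  # doc: str | None defaults to None

@demo.register(int)
def _demo_int(value: int) -> str:
    return f"int: {value}"

@demo.register(str)
def _demo_str(value: str) -> str:
    return f"str: {value}"

demo(3)      # dispatches on type(3) -> "int: 3"
demo("abc")  # dispatches on type("abc") -> "str: abc"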
diff --git a/botorch/utils/feasible_volume.py b/botorch/utils/feasible_volume.py index d14fb8fe46..664420937e 100644 --- a/botorch/utils/feasible_volume.py +++ b/botorch/utils/feasible_volume.py @@ -6,7 +6,8 @@ from __future__ import annotations -from typing import Callable, Optional +from collections.abc import Callable + import botorch.models.model as model import torch @@ -20,7 +21,7 @@ def get_feasible_samples( samples: Tensor, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, ) -> tuple[Tensor, float]: r""" Checks which of the samples satisfy all of the inequality constraints. @@ -62,7 +63,7 @@ def get_outcome_feasibility_probability( outcome_constraints: list[Callable[[Tensor], Tensor]], threshold: float = 0.1, nsample_outcome: int = 1000, - seed: Optional[int] = None, + seed: int | None = None, ) -> float: r""" Monte Carlo estimate of the feasible volume with respect to the outcome constraints. @@ -120,14 +121,14 @@ def estimate_feasible_volume( bounds: Tensor, model: model.Model, outcome_constraints: list[Callable[[Tensor], Tensor]], - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, nsample_feature: int = 1000, nsample_outcome: int = 1000, threshold: float = 0.1, verbose: bool = False, - seed: Optional[int] = None, - device: Optional[torch.device] = None, - dtype: Optional[torch.dtype] = None, + seed: int | None = None, + device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> tuple[float, float]: r""" Monte Carlo estimate of the feasible volume with respect diff --git a/botorch/utils/gp_sampling.py b/botorch/utils/gp_sampling.py index c85a73f7c1..65577252df 100644 --- a/botorch/utils/gp_sampling.py +++ b/botorch/utils/gp_sampling.py @@ -9,7 +9,6 @@ import warnings from copy import deepcopy from math import pi -from typing import Optional import torch from botorch.models.converter import batched_to_model_list @@ -36,7 +35,7 @@ class GPDraw(Module): This does not yet support multi-output models. """ - def __init__(self, model: Model, seed: Optional[int] = None) -> None: + def __init__(self, model: Model, seed: int | None = None) -> None: r"""Construct a GP function sampler. Args: @@ -130,7 +129,7 @@ def __init__( kernel: Kernel, input_dim: int, num_rff_features: int, - sample_shape: Optional[torch.Size] = None, + sample_shape: torch.Size | None = None, ) -> None: r"""Initialize RandomFourierFeatures. @@ -186,7 +185,7 @@ def _get_weights( base_kernel: Kernel, input_dim: int, num_rff_features: int, - sample_shape: Optional[torch.Size] = None, + sample_shape: torch.Size | None = None, ) -> Tensor: r"""Sample weights for RFF. diff --git a/botorch/utils/multi_objective/box_decompositions/box_decomposition.py b/botorch/utils/multi_objective/box_decompositions/box_decomposition.py index f6b8404810..44e5f7872d 100644 --- a/botorch/utils/multi_objective/box_decompositions/box_decomposition.py +++ b/botorch/utils/multi_objective/box_decompositions/box_decomposition.py @@ -18,7 +18,6 @@ from __future__ import annotations from abc import ABC, abstractmethod -from typing import Optional import torch from botorch.exceptions.errors import BotorchError @@ -38,9 +37,7 @@ class BoxDecomposition(Module, ABC): Note: Internally, we store the negative reference point (minimization). 
""" - def __init__( - self, ref_point: Tensor, sort: bool, Y: Optional[Tensor] = None - ) -> None: + def __init__(self, ref_point: Tensor, sort: bool, Y: Tensor | None = None) -> None: """Initialize BoxDecomposition. Args: @@ -261,7 +258,7 @@ class FastPartitioning(BoxDecomposition, ABC): def __init__( self, ref_point: Tensor, - Y: Optional[Tensor] = None, + Y: Tensor | None = None, ) -> None: """ Args: diff --git a/botorch/utils/multi_objective/box_decompositions/box_decomposition_list.py b/botorch/utils/multi_objective/box_decompositions/box_decomposition_list.py index 56f01cdaa9..eef5df241c 100644 --- a/botorch/utils/multi_objective/box_decompositions/box_decomposition_list.py +++ b/botorch/utils/multi_objective/box_decompositions/box_decomposition_list.py @@ -8,7 +8,6 @@ from __future__ import annotations -from typing import Union import torch from botorch.exceptions.errors import BotorchTensorDimensionError @@ -94,7 +93,7 @@ def get_hypercell_bounds(self) -> Tensor: return torch.stack(bounds_list, dim=-3) - def update(self, Y: Union[list[Tensor], Tensor]) -> None: + def update(self, Y: list[Tensor] | Tensor) -> None: r"""Update the partitioning. Args: diff --git a/botorch/utils/multi_objective/box_decompositions/non_dominated.py b/botorch/utils/multi_objective/box_decompositions/non_dominated.py index 486d126654..682370913c 100644 --- a/botorch/utils/multi_objective/box_decompositions/non_dominated.py +++ b/botorch/utils/multi_objective/box_decompositions/non_dominated.py @@ -18,7 +18,6 @@ from __future__ import annotations -from typing import Optional import torch from botorch.utils.multi_objective.box_decompositions.box_decomposition import ( @@ -61,7 +60,7 @@ class NondominatedPartitioning(BoxDecomposition): def __init__( self, ref_point: Tensor, - Y: Optional[Tensor] = None, + Y: Tensor | None = None, alpha: float = 0.0, ) -> None: """Initialize NondominatedPartitioning. @@ -370,7 +369,7 @@ class FastNondominatedPartitioning(FastPartitioning): def __init__( self, ref_point: Tensor, - Y: Optional[Tensor] = None, + Y: Tensor | None = None, ) -> None: """Initialize FastNondominatedPartitioning. diff --git a/botorch/utils/multi_objective/box_decompositions/utils.py b/botorch/utils/multi_objective/box_decompositions/utils.py index 2c65a103c0..191b90711b 100644 --- a/botorch/utils/multi_objective/box_decompositions/utils.py +++ b/botorch/utils/multi_objective/box_decompositions/utils.py @@ -6,7 +6,6 @@ r"""Utilities for box decomposition algorithms.""" -from typing import Optional import torch from botorch.exceptions.errors import BotorchTensorDimensionError, UnsupportedError @@ -41,7 +40,7 @@ def _pad_batch_pareto_frontier( Y: Tensor, ref_point: Tensor, is_pareto: bool = False, - feasibility_mask: Optional[Tensor] = None, + feasibility_mask: Tensor | None = None, ) -> Tensor: r"""Get a batch Pareto frontier by padding the pareto frontier with repeated points. 
diff --git a/botorch/utils/multi_objective/hypervolume.py b/botorch/utils/multi_objective/hypervolume.py index 310185f9ec..18df8322ed 100644 --- a/botorch/utils/multi_objective/hypervolume.py +++ b/botorch/utils/multi_objective/hypervolume.py @@ -22,11 +22,11 @@ from __future__ import annotations import warnings +from collections.abc import Callable from copy import deepcopy from itertools import combinations -from typing import Callable, Optional, Union import torch from botorch.acquisition.cached_cholesky import CachedCholeskyMCSamplerMixin @@ -64,7 +64,7 @@ def infer_reference_point( pareto_Y: Tensor, - max_ref_point: Optional[Tensor] = None, + max_ref_point: Tensor | None = None, scale: float = 0.1, scale_max_ref_point: bool = False, ) -> Tensor: @@ -346,7 +346,7 @@ def __init__( m: int, dtype: torch.dtype, device: torch.device, - data: Optional[Tensor] = None, + data: Tensor | None = None, ) -> None: r"""Initialize MultiList. @@ -483,7 +483,7 @@ def compute_q_subset_indices( def compute_subset_indices( - q: int, device: Optional[torch.device] = None + q: int, device: torch.device | None = None ) -> BufferDict[str, Tensor]: r"""Compute all (2^q - 1) distinct subsets of {1, ..., `q`}. @@ -508,19 +508,19 @@ class NoisyExpectedHypervolumeMixin(CachedCholeskyMCSamplerMixin): def __init__( self, model: Model, - ref_point: Union[list[float], Tensor], + ref_point: list[float] | Tensor, X_baseline: Tensor, - sampler: Optional[MCSampler] = None, - objective: Optional[MCMultiOutputObjective] = None, - constraints: Optional[list[Callable[[Tensor], Tensor]]] = None, - X_pending: Optional[Tensor] = None, + sampler: MCSampler | None = None, + objective: MCMultiOutputObjective | None = None, + constraints: list[Callable[[Tensor], Tensor]] | None = None, + X_pending: Tensor | None = None, prune_baseline: bool = False, alpha: float = 0.0, cache_pending: bool = True, max_iep: int = 0, incremental_nehvi: bool = True, cache_root: bool = True, - marginalize_dim: Optional[int] = None, + marginalize_dim: int | None = None, ): """Initialize a mixin that contains functions for the batched Pareto-frontier partitioning used by the noisy hypervolume-improvement-based acquisition @@ -651,7 +651,7 @@ def X_baseline(self) -> Tensor: r"""Return X_baseline augmented with pending points cached using CBD.""" return self._X_baseline_and_pending - def _compute_initial_hvs(self, obj: Tensor, feas: Optional[Tensor] = None) -> None: + def _compute_initial_hvs(self, obj: Tensor, feas: Tensor | None = None) -> None: r"""Compute hypervolume dominated by f(X_baseline) under each sample. Args: @@ -775,7 +775,7 @@ def _set_cell_bounds(self, num_new_points: int) -> None: self.register_buffer("cell_lower_bounds", cell_bounds[0]) self.register_buffer("cell_upper_bounds", cell_bounds[1]) - def set_X_pending(self, X_pending: Optional[Tensor] = None) -> None: + def set_X_pending(self, X_pending: Tensor | None = None) -> None: r"""Informs the acquisition function about pending design points. 
Args: diff --git a/botorch/utils/multi_objective/scalarization.py b/botorch/utils/multi_objective/scalarization.py index 4fa6dfde77..222d5e7001 100644 --- a/botorch/utils/multi_objective/scalarization.py +++ b/botorch/utils/multi_objective/scalarization.py @@ -16,7 +16,8 @@ """ from __future__ import annotations -from typing import Callable, Optional +from collections.abc import Callable + import torch from botorch.exceptions.errors import BotorchTensorDimensionError, UnsupportedError @@ -26,7 +27,7 @@ def get_chebyshev_scalarization( weights: Tensor, Y: Tensor, alpha: float = 0.05 -) -> Callable[[Tensor, Optional[Tensor]], Tensor]: +) -> Callable[[Tensor, Tensor | None], Tensor]: r"""Construct an augmented Chebyshev scalarization. The augmented Chebyshev scalarization is given by @@ -75,7 +76,7 @@ def get_chebyshev_scalarization( elif Y.ndim > 2: raise NotImplementedError("Batched Y is not currently supported.") - def chebyshev_obj(Y: Tensor, X: Optional[Tensor] = None) -> Tensor: + def chebyshev_obj(Y: Tensor, X: Tensor | None = None) -> Tensor: product = weights * Y return product.max(dim=-1).values + alpha * product.sum(dim=-1) @@ -89,7 +90,7 @@ def chebyshev_obj(Y: Tensor, X: Optional[Tensor] = None) -> Tensor: ) # If there are no observations, we do not need to normalize the objectives - def obj(Y: Tensor, X: Optional[Tensor] = None) -> Tensor: + def obj(Y: Tensor, X: Tensor | None = None) -> Tensor: # multiply the scalarization by -1, so that the scalarization should # be maximized return -chebyshev_obj(Y=-Y) @@ -98,7 +99,7 @@ def obj(Y: Tensor, X: Optional[Tensor] = None) -> Tensor: # Set the bounds to be [min(Y_m), max(Y_m)], for each objective m. Y_bounds = torch.stack([Y.min(dim=-2).values, Y.max(dim=-2).values]) - def obj(Y: Tensor, X: Optional[Tensor] = None) -> Tensor: + def obj(Y: Tensor, X: Tensor | None = None) -> Tensor: # scale to [0,1] Y_normalized = normalize(-Y, bounds=Y_bounds) # If minimizing an objective, convert Y_normalized values to [-1,0], diff --git a/botorch/utils/objective.py b/botorch/utils/objective.py index 7fc89210da..d92b1cac5a 100644 --- a/botorch/utils/objective.py +++ b/botorch/utils/objective.py @@ -10,7 +10,8 @@ from __future__ import annotations -from typing import Callable, Optional, Union +from collections.abc import Callable + import torch from botorch.utils.safe_math import log_fatmoid, logexpit @@ -19,8 +20,8 @@ def get_objective_weights_transform( - weights: Optional[Tensor], -) -> Callable[[Tensor, Optional[Tensor]], Tensor]: + weights: Tensor | None, +) -> Callable[[Tensor, Tensor | None], Tensor]: r"""Create a linear objective callable from a set of weights. Create a callable mapping a Tensor of size `b x q x m` and an (optional) @@ -42,7 +43,7 @@ def get_objective_weights_transform( >>> transform = get_objective_weights_transform(weights) """ - def _objective(Y: Tensor, X: Optional[Tensor] = None): + def _objective(Y: Tensor, X: Tensor | None = None): r"""Evaluate objective. Note: einsum multiples Y by weights and sums over the `m`-dimension. @@ -67,7 +68,7 @@ def apply_constraints_nonnegative_soft( obj: Tensor, constraints: list[Callable[[Tensor], Tensor]], samples: Tensor, - eta: Union[Tensor, float], + eta: Tensor | float, ) -> Tensor: r"""Applies constraints to a non-negative objective. 
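The scalarization callable above keeps its `(Y, X=None)` calling convention, now spelled `Callable[[Tensor, Tensor | None], Tensor]`; a short sketch (weights and data are illustrative):

import torch
from botorch.utils.multi_objective.scalarization import get_chebyshev_scalarization

weights = torch.tensor([0.5, 0.5], dtype=torch.float64)
Y = torch.rand(20, 2, dtype=torch.float64)

objective = get_chebyshev_scalarization(weights=weights, Y=Y)
scalarized = objective(Y)  # X defaults to None under the new hint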
@@ -99,9 +100,9 @@ def apply_constraints_nonnegative_soft( def compute_feasibility_indicator( - constraints: Optional[list[Callable[[Tensor], Tensor]]], + constraints: list[Callable[[Tensor], Tensor]] | None, samples: Tensor, - marginalize_dim: Optional[int] = None, + marginalize_dim: int | None = None, ) -> Tensor: r"""Computes the feasibility of a list of constraints given posterior samples. @@ -134,7 +135,7 @@ def compute_feasibility_indicator( def compute_smoothed_feasibility_indicator( constraints: list[Callable[[Tensor], Tensor]], samples: Tensor, - eta: Union[Tensor, float], + eta: Tensor | float, log: bool = False, fat: bool = False, ) -> Tensor: @@ -185,7 +186,7 @@ def apply_constraints( constraints: list[Callable[[Tensor], Tensor]], samples: Tensor, infeasible_cost: float, - eta: Union[Tensor, float] = 1e-3, + eta: Tensor | float = 1e-3, ) -> Tensor: r"""Apply constraints using an infeasible_cost `M` for negative objectives. diff --git a/botorch/utils/probability/bvn.py b/botorch/utils/probability/bvn.py index 1499a2a3ac..a0e8183da6 100644 --- a/botorch/utils/probability/bvn.py +++ b/botorch/utils/probability/bvn.py @@ -19,7 +19,6 @@ from __future__ import annotations from math import pi as _pi -from typing import Optional import torch from botorch.exceptions import UnsupportedError @@ -134,7 +133,7 @@ def bvnu(r: Tensor, h: Tensor, k: Tensor) -> Tensor: def _bvnu_polar( - r: Tensor, h: Tensor, k: Tensor, num_points: Optional[int] = None + r: Tensor, h: Tensor, k: Tensor, num_points: int | None = None ) -> Tensor: r"""Solves for `P(x > h, y > k)` by integrating in polar coordinates as @@ -242,7 +241,7 @@ def bvnmom( yl: Tensor, xu: Tensor, yu: Tensor, - p: Optional[Tensor] = None, + p: Tensor | None = None, ) -> tuple[Tensor, Tensor]: r"""Computes the expected values of truncated, bivariate normal random variables. 
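`eta: Tensor | float` in the smoothed feasibility helper above accepts either a shared scalar temperature or one value per constraint; a minimal sketch with one illustrative constraint (negative slack denotes feasibility):

import torch
from botorch.utils.objective import compute_smoothed_feasibility_indicator

constraints = [lambda samples: samples[..., 0] - 0.5]  # feasible if x0 < 0.5
samples = torch.rand(4, 3, 2)  # sample_shape x q x m (illustrative shape)

ind_scalar = compute_smoothed_feasibility_indicator(
    constraints=constraints, samples=samples, eta=1e-3
)
ind_per_constraint = compute_smoothed_feasibility_indicator(
    constraints=constraints, samples=samples, eta=torch.tensor([1e-3])
)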
diff --git a/botorch/utils/probability/lin_ess.py b/botorch/utils/probability/lin_ess.py index 2b524bad92..120c44ffa0 100644 --- a/botorch/utils/probability/lin_ess.py +++ b/botorch/utils/probability/lin_ess.py @@ -35,7 +35,6 @@ from __future__ import annotations import math -from typing import Optional, Union import torch from botorch.utils.sampling import PolytopeSampler @@ -55,13 +54,13 @@ class LinearEllipticalSliceSampler(PolytopeSampler): def __init__( self, - inequality_constraints: Optional[tuple[Tensor, Tensor]] = None, - bounds: Optional[Tensor] = None, - interior_point: Optional[Tensor] = None, - fixed_indices: Optional[Union[list[int], Tensor]] = None, - mean: Optional[Tensor] = None, - covariance_matrix: Optional[Union[Tensor, LinearOperator]] = None, - covariance_root: Optional[Union[Tensor, LinearOperator]] = None, + inequality_constraints: tuple[Tensor, Tensor] | None = None, + bounds: Tensor | None = None, + interior_point: Tensor | None = None, + fixed_indices: list[int] | Tensor | None = None, + mean: Tensor | None = None, + covariance_matrix: Tensor | LinearOperator | None = None, + covariance_root: Tensor | LinearOperator | None = None, check_feasibility: bool = False, burnin: int = 0, thinning: int = 0, @@ -183,12 +182,12 @@ def _fixed_features_initialization( self, A: Tensor, b: Tensor, - interior_point: Optional[Tensor], - fixed_indices: Union[list[int], Tensor], - mean: Optional[Tensor], - covariance_matrix: Optional[Tensor], - covariance_root: Optional[Tensor], - ) -> tuple[Optional[Tensor], Optional[Tensor]]: + interior_point: Tensor | None, + fixed_indices: list[int] | Tensor, + mean: Tensor | None, + covariance_matrix: Tensor | None, + covariance_root: Tensor | None, + ) -> tuple[Tensor | None, Tensor | None]: """Modifies the constraint system (A, b) due to fixed indices and assigns the modified constraints system to `self._Az`, `self._bz`. NOTE: Needs to be called prior to `self._standardization_initialization` in the constructor. @@ -493,7 +492,7 @@ def _unstandardize(self, z: Tensor) -> Tensor: def get_index_tensors( - fixed_indices: Union[list[int], Tensor], d: int + fixed_indices: list[int] | Tensor, d: int ) -> tuple[Tensor, Tensor]: """Converts `fixed_indices` to a `d`-dim integral Tensor that is True at indices that are contained in `fixed_indices` and False otherwise. diff --git a/botorch/utils/probability/linalg.py b/botorch/utils/probability/linalg.py index 080cfec220..331b59913d 100644 --- a/botorch/utils/probability/linalg.py +++ b/botorch/utils/probability/linalg.py @@ -10,7 +10,7 @@ from dataclasses import dataclass, InitVar from itertools import chain -from typing import Any, Optional +from typing import Any import torch from botorch.utils.probability.utils import swap_along_dim_ @@ -34,9 +34,9 @@ def block_matrix_concat(blocks: Sequence[Sequence[Tensor]]) -> Tensor: def augment_cholesky( Laa: Tensor, Kbb: Tensor, - Kba: Optional[Tensor] = None, - Lba: Optional[Tensor] = None, - jitter: Optional[float] = None, + Kba: Tensor | None = None, + Lba: Tensor | None = None, + jitter: float | None = None, ) -> Tensor: r"""Computes the Cholesky factor of a block matrix `K = [[Kaa, Kab], [Kba, Kbb]]` based on a precomputed Cholesky factor `Kaa = Laa Laa^T`. 
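The `LinearEllipticalSliceSampler` above keeps its keyword-only optionals, now typed `Tensor | None` and `list[int] | Tensor | None`; a small sketch sampling a standard bivariate normal truncated to the unit box (all values illustrative):

import torch
from botorch.utils.probability.lin_ess import LinearEllipticalSliceSampler

bounds = torch.tensor([[0.0, 0.0], [1.0, 1.0]], dtype=torch.float64)
sampler = LinearEllipticalSliceSampler(
    bounds=bounds,
    interior_point=torch.full((2, 1), 0.5, dtype=torch.float64),  # d x 1
)
draws = sampler.draw(n=16)  # 16 x 2 truncated-normal samples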
@@ -77,7 +77,7 @@ class PivotedCholesky: step: int tril: Tensor perm: LongTensor - diag: Optional[Tensor] = None + diag: Tensor | None = None validate_init: InitVar[bool] = True def __post_init__(self, validate_init: bool = True): diff --git a/botorch/utils/probability/mvnxpb.py b/botorch/utils/probability/mvnxpb.py index d125957f2c..02ff598ae6 100644 --- a/botorch/utils/probability/mvnxpb.py +++ b/botorch/utils/probability/mvnxpb.py @@ -24,7 +24,7 @@ from __future__ import annotations -from typing import Any, Optional, TypedDict +from typing import Any, TypedDict from warnings import warn import torch @@ -56,7 +56,7 @@ class mvnxpbState(TypedDict): piv_chol: PivotedCholesky plug_ins: Tensor log_prob: Tensor - log_prob_extra: Optional[Tensor] + log_prob_extra: Tensor | None class MVNXPB: @@ -101,7 +101,7 @@ def __init__(self, covariance_matrix: Tensor, bounds: Tensor) -> None: batch_shape + [n], float("nan"), device=device, dtype=dtype ) self.log_prob = torch.zeros(batch_shape, device=device, dtype=dtype) - self.log_prob_extra: Optional[Tensor] = None + self.log_prob_extra: Tensor | None = None @classmethod def build( @@ -112,7 +112,7 @@ def build( piv_chol: PivotedCholesky, plug_ins: Tensor, log_prob: Tensor, - log_prob_extra: Optional[Tensor] = None, + log_prob_extra: Tensor | None = None, ) -> MVNXPB: r"""Creates an MVNXPB instance from raw arguments. Unlike MVNXPB.__init__, this method does not preprocess or copy terms. @@ -137,7 +137,7 @@ def build( new.log_prob_extra = log_prob_extra return new - def solve(self, num_steps: Optional[int] = None, eps: float = 1e-10) -> Tensor: + def solve(self, num_steps: int | None = None, eps: float = 1e-10) -> Tensor: r"""Runs the MVNXPB solver instance for a fixed number of steps. Calculates a bivariate conditional approximation to P(X \in bounds), where @@ -242,7 +242,7 @@ def solve(self, num_steps: Optional[int] = None, eps: float = 1e-10) -> Tensor: return self.log_prob - def select_pivot(self) -> Optional[LongTensor]: + def select_pivot(self) -> LongTensor | None: r"""GGE variable prioritization strategy from [Gibson1994monte]_. Returns the index of the random variable least likely to satisfy its bounds @@ -342,8 +342,8 @@ def augment( bounds: Tensor, cross_covariance_matrix: Tensor, disable_pivoting: bool = False, - jitter: Optional[float] = None, - max_tries: Optional[int] = None, + jitter: float | None = None, + max_tries: int | None = None, ) -> MVNXPB: r"""Augment an `n`-dimensional MVNXPB instance to include `m` additional random variables.
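`MVNXPB.solve` above defaults to a full run via `num_steps: int | None = None`; a compact sketch estimating a box probability for a correlated bivariate normal (numbers are illustrative):

import torch
from botorch.utils.probability.mvnxpb import MVNXPB

covar = torch.tensor([[1.0, 0.5], [0.5, 1.0]], dtype=torch.float64)
bounds = torch.tensor([[-1.0, 1.0], [-1.0, 1.0]], dtype=torch.float64)  # n x 2

solver = MVNXPB(covariance_matrix=covar, bounds=bounds)
log_prob = solver.solve()  # num_steps=None runs the solver to completion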
diff --git a/botorch/utils/probability/truncated_multivariate_normal.py b/botorch/utils/probability/truncated_multivariate_normal.py index 5523e3d2c3..7907008199 100644 --- a/botorch/utils/probability/truncated_multivariate_normal.py +++ b/botorch/utils/probability/truncated_multivariate_normal.py @@ -8,7 +8,6 @@ from collections.abc import Sequence -from typing import Optional import torch from botorch.utils.probability.lin_ess import LinearEllipticalSliceSampler @@ -22,13 +21,13 @@ class TruncatedMultivariateNormal(MultivariateNormal): def __init__( self, loc: Tensor, - covariance_matrix: Optional[Tensor] = None, - precision_matrix: Optional[Tensor] = None, - scale_tril: Optional[Tensor] = None, + covariance_matrix: Tensor | None = None, + precision_matrix: Tensor | None = None, + scale_tril: Tensor | None = None, bounds: Tensor = None, - solver: Optional[MVNXPB] = None, - sampler: Optional[LinearEllipticalSliceSampler] = None, - validate_args: Optional[bool] = None, + solver: MVNXPB | None = None, + sampler: LinearEllipticalSliceSampler | None = None, + validate_args: bool | None = None, ): r"""Initializes an instance of a TruncatedMultivariateNormal distribution. diff --git a/botorch/utils/probability/unified_skew_normal.py b/botorch/utils/probability/unified_skew_normal.py index b042cc5ec4..d13715f424 100644 --- a/botorch/utils/probability/unified_skew_normal.py +++ b/botorch/utils/probability/unified_skew_normal.py @@ -9,7 +9,6 @@ from collections.abc import Sequence from inspect import getmembers -from typing import Optional, Union import torch from botorch.utils.probability.linalg import augment_cholesky, block_matrix_concat @@ -32,8 +31,8 @@ def __init__( self, trunc: TruncatedMultivariateNormal, gauss: MultivariateNormal, - cross_covariance_matrix: Union[Tensor, LinearOperator], - validate_args: Optional[bool] = None, + cross_covariance_matrix: Tensor | LinearOperator, + validate_args: bool | None = None, ): r"""Unified Skew Normal distribution of `Y | a < X < b` for jointly Gaussian random vectors `X ∈ R^m` and `Y ∈ R^n`. diff --git a/botorch/utils/probability/utils.py b/botorch/utils/probability/utils.py index 2f29b146f9..ddf01ff785 100644 --- a/botorch/utils/probability/utils.py +++ b/botorch/utils/probability/utils.py @@ -7,12 +7,12 @@ from __future__ import annotations import math -from collections.abc import Iterable, Iterator +from collections.abc import Callable, Iterable, Iterator from functools import lru_cache from math import pi from numbers import Number -from typing import Any, Callable, Optional, Union +from typing import Any import torch from botorch.utils.safe_math import logdiffexp @@ -80,10 +80,10 @@ def case_dispatcher( @lru_cache(maxsize=None) def get_constants( - values: Union[Number, Iterator[Number]], - device: Optional[torch.device] = None, - dtype: Optional[torch.dtype] = None, -) -> Union[Tensor, tuple[Tensor, ...]]: + values: Number | Iterator[Number], + device: torch.device | None = None, + dtype: torch.dtype | None = None, +) -> Tensor | tuple[Tensor, ...]: r"""Returns scalar-valued Tensors containing each of the given constants. Used to expedite tensor operations involving scalar arithmetic. 
Note that the returned Tensors should not be modified in-place.""" @@ -94,16 +94,16 @@ def get_constants( def get_constants_like( - values: Union[Number, Iterator[Number]], + values: Number | Iterator[Number], ref: Tensor, -) -> Union[Tensor, Iterator[Tensor]]: +) -> Tensor | Iterator[Tensor]: return get_constants(values, device=ref.device, dtype=ref.dtype) def gen_positional_indices( shape: torch.Size, dim: int, - device: Optional[torch.device] = None, + device: torch.device | None = None, ) -> Iterator[torch.LongTensor]: ndim = len(shape) _dim = ndim + dim if dim < 0 else dim @@ -119,7 +119,7 @@ def gen_positional_indices( def build_positional_indices( shape: torch.Size, dim: int, - device: Optional[torch.device] = None, + device: torch.device | None = None, ) -> LongTensor: return sum(gen_positional_indices(shape=shape, dim=dim, device=device)) @@ -259,10 +259,10 @@ def log_prob_normal_in(a: Tensor, b: Tensor) -> Tensor: def swap_along_dim_( values: Tensor, - i: Union[int, LongTensor], - j: Union[int, LongTensor], + i: int | LongTensor, + j: int | LongTensor, dim: int, - buffer: Optional[Tensor] = None, + buffer: Tensor | None = None, ) -> Tensor: r"""Swaps Tensor slices in-place along dimension `dim`. diff --git a/botorch/utils/safe_math.py b/botorch/utils/safe_math.py index 8b4ce7b957..ce9dd6e01b 100644 --- a/botorch/utils/safe_math.py +++ b/botorch/utils/safe_math.py @@ -16,8 +16,8 @@ from __future__ import annotations import math +from collections.abc import Callable -from typing import Callable, Union import torch from botorch.exceptions import UnsupportedError @@ -121,9 +121,7 @@ def logdiffexp(log_a: Tensor, log_b: Tensor) -> Tensor: return log_b + log1mexp(log_a - log_b.masked_fill(is_inf, 0.0)) -def logsumexp( - x: Tensor, dim: Union[int, tuple[int, ...]], keepdim: bool = False -) -> Tensor: +def logsumexp(x: Tensor, dim: int | tuple[int, ...], keepdim: bool = False) -> Tensor: """Version of logsumexp that has a well-behaved backward pass when x contains infinities. @@ -149,7 +147,7 @@ def logsumexp( def _inf_max_helper( max_fun: Callable[[Tensor], Tensor], x: Tensor, - dim: Union[int, tuple[int, ...]], + dim: int | tuple[int, ...], keepdim: bool, ) -> Tensor: """Helper function that generalizes the treatment of infinities for approximations @@ -187,7 +185,7 @@ def _inf_max_helper( return res if keepdim else res.sum(dim=dim) -def _any(x: Tensor, dim: Union[int, tuple[int, ...]], keepdim: bool = False) -> Tensor: +def _any(x: Tensor, dim: int | tuple[int, ...], keepdim: bool = False) -> Tensor: """Extension of torch.any, which supports reducing over tuples of dimensions. Args: @@ -206,9 +204,7 @@ def _any(x: Tensor, dim: Union[int, tuple[int, ...]], keepdim: bool = False) -> return x if keepdim else x.squeeze(dim) -def logmeanexp( - X: Tensor, dim: Union[int, tuple[int, ...]], keepdim: bool = False -) -> Tensor: +def logmeanexp(X: Tensor, dim: int | tuple[int, ...], keepdim: bool = False) -> Tensor: """Computes `log(mean(exp(X), dim=dim, keepdim=keepdim))`. Args: @@ -223,7 +219,7 @@ def logmeanexp( return logsumexp(X, dim=dim, keepdim=keepdim) - math.log(n) -def log_softplus(x: Tensor, tau: Union[float, Tensor] = TAU) -> Tensor: +def log_softplus(x: Tensor, tau: float | Tensor = TAU) -> Tensor: """Computes the logarithm of the softplus function with high numerical accuracy. Args: @@ -249,9 +245,9 @@ def log_softplus(x: Tensor, tau: Union[float, Tensor] = TAU) -> Tensor: def smooth_amax( X: Tensor, - dim: Union[int, tuple[int, ...]] = -1, + dim: int | tuple[int, ...] 
= -1, keepdim: bool = False, - tau: Union[float, Tensor] = 1.0, + tau: float | Tensor = 1.0, ) -> Tensor: """Computes a smooth approximation to `max(X, dim=dim)`, i.e the maximum value of `X` over dimension `dim`, using the logarithm of the `l_(1/tau)` norm of `exp(X)`. @@ -275,9 +271,9 @@ def smooth_amax( def smooth_amin( X: Tensor, - dim: Union[int, tuple[int, ...]] = -1, + dim: int | tuple[int, ...] = -1, keepdim: bool = False, - tau: Union[float, Tensor] = 1.0, + tau: float | Tensor = 1.0, ) -> Tensor: """A smooth approximation to `min(X, dim=dim)`, similar to `smooth_amax`.""" return -smooth_amax(X=-X, dim=dim, keepdim=keepdim, tau=tau) @@ -290,7 +286,7 @@ def check_dtype_float32_or_float64(X: Tensor) -> None: ) -def log_fatplus(x: Tensor, tau: Union[float, Tensor] = TAU) -> Tensor: +def log_fatplus(x: Tensor, tau: float | Tensor = TAU) -> Tensor: """Computes the logarithm of the fat-tailed softplus. NOTE: Separated out in case the complexity of the `log` implementation increases @@ -299,7 +295,7 @@ def log_fatplus(x: Tensor, tau: Union[float, Tensor] = TAU) -> Tensor: return fatplus(x, tau=tau).log() -def fatplus(x: Tensor, tau: Union[float, Tensor] = TAU) -> Tensor: +def fatplus(x: Tensor, tau: float | Tensor = TAU) -> Tensor: """Computes a fat-tailed approximation to `ReLU(x) = max(x, 0)` by linearly combining a regular softplus function and the density function of a Cauchy distribution. The coefficient `alpha` of the Cauchy density is chosen to guarantee @@ -322,9 +318,9 @@ def _fatplus(x: Tensor) -> Tensor: def fatmax( x: Tensor, - dim: Union[int, tuple[int, ...]], + dim: int | tuple[int, ...], keepdim: bool = False, - tau: Union[float, Tensor] = TAU, + tau: float | Tensor = TAU, alpha: float = ALPHA, ) -> Tensor: """Computes a smooth approximation to amax(X, dim=dim) with a fat tail. @@ -344,9 +340,7 @@ def fatmax( A Tensor of smooth approximations to `amax(X, dim=dim)` with a fat tail. """ - def max_fun( - x: Tensor, dim: Union[int, tuple[int, ...]], keepdim: bool = False - ) -> Tensor: + def max_fun(x: Tensor, dim: int | tuple[int, ...], keepdim: bool = False) -> Tensor: return tau * _pareto(-x / tau, alpha=alpha).sum(dim=dim, keepdim=keepdim).log() return _inf_max_helper(max_fun=max_fun, x=x, dim=dim, keepdim=keepdim) @@ -354,9 +348,9 @@ def max_fun( def fatmin( x: Tensor, - dim: Union[int, tuple[int, ...]], + dim: int | tuple[int, ...], keepdim: bool = False, - tau: Union[float, Tensor] = TAU, + tau: float | Tensor = TAU, alpha: float = ALPHA, ) -> Tensor: """Computes a smooth approximation to amin(X, dim=dim) with a fat tail. @@ -379,7 +373,7 @@ def fatmin( def fatmaximum( - a: Tensor, b: Tensor, tau: Union[float, Tensor] = TAU, alpha: float = ALPHA + a: Tensor, b: Tensor, tau: float | Tensor = TAU, alpha: float = ALPHA ) -> Tensor: """Computes a smooth approximation to torch.maximum(a, b) with a fat tail. @@ -402,7 +396,7 @@ def fatmaximum( def fatminimum( - a: Tensor, b: Tensor, tau: Union[float, Tensor] = TAU, alpha: float = ALPHA + a: Tensor, b: Tensor, tau: float | Tensor = TAU, alpha: float = ALPHA ) -> Tensor: """Computes a smooth approximation to torch.minimum(a, b) with a fat tail. @@ -419,14 +413,14 @@ def fatminimum( return -fatmaximum(-a, -b, tau=tau, alpha=alpha) -def log_fatmoid(X: Tensor, tau: Union[float, Tensor] = 1.0) -> Tensor: +def log_fatmoid(X: Tensor, tau: float | Tensor = 1.0) -> Tensor: """Computes the logarithm of the fatmoid. Separated out in case the implementation of the logarithm becomes more complex in the future to ensure numerical stability. 
""" return fatmoid(X, tau=tau).log() -def fatmoid(X: Tensor, tau: Union[float, Tensor] = 1.0) -> Tensor: +def fatmoid(X: Tensor, tau: float | Tensor = 1.0) -> Tensor: """Computes a twice continuously differentiable approximation to the Heaviside step function with a fat tail, i.e. `O(1 / x^2)` as `x` goes to -inf. diff --git a/botorch/utils/sampling.py b/botorch/utils/sampling.py index 38fdf3ffc2..bc48c930cd 100644 --- a/botorch/utils/sampling.py +++ b/botorch/utils/sampling.py @@ -21,7 +21,7 @@ from abc import ABC, abstractmethod from collections.abc import Generator, Iterable from contextlib import contextmanager -from typing import Any, Optional, TYPE_CHECKING, Union +from typing import Any, TYPE_CHECKING import numpy as np import scipy @@ -40,7 +40,7 @@ @contextmanager -def manual_seed(seed: Optional[int] = None) -> Generator[None, None, None]: +def manual_seed(seed: int | None = None) -> Generator[None, None, None]: r"""Contextmanager for manual setting the torch.random seed. Args: @@ -67,8 +67,8 @@ def draw_sobol_samples( bounds: Tensor, n: int, q: int, - batch_shape: Optional[Union[Iterable[int], torch.Size]] = None, - seed: Optional[int] = None, + batch_shape: Iterable[int] | torch.Size | None = None, + seed: int | None = None, ) -> Tensor: r"""Draw qMC samples from the box defined by bounds. @@ -108,9 +108,9 @@ def draw_sobol_samples( def draw_sobol_normal_samples( d: int, n: int, - device: Optional[torch.device] = None, - dtype: Optional[torch.dtype] = None, - seed: Optional[int] = None, + device: torch.device | None = None, + dtype: torch.dtype | None = None, + seed: int | None = None, ) -> Tensor: r"""Draw qMC samples from a multi-variate standard normal N(0, I_d). @@ -141,9 +141,9 @@ def sample_hypersphere( d: int, n: int = 1, qmc: bool = False, - seed: Optional[int] = None, - device: Optional[torch.device] = None, - dtype: Optional[torch.dtype] = None, + seed: int | None = None, + device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> Tensor: r"""Sample uniformly from a unit d-sphere. @@ -179,9 +179,9 @@ def sample_simplex( d: int, n: int = 1, qmc: bool = False, - seed: Optional[int] = None, - device: Optional[torch.device] = None, - dtype: Optional[torch.dtype] = None, + seed: int | None = None, + device: torch.device | None = None, + dtype: torch.dtype | None = None, ) -> Tensor: r"""Sample uniformly from a d-simplex. @@ -223,7 +223,7 @@ def sample_polytope( n: int = 10000, n0: int = 100, n_thinning: int = 1, - seed: Optional[int] = None, + seed: int | None = None, ) -> Tensor: r""" Hit and run sampler from uniform sampling points from a polytope, @@ -313,8 +313,8 @@ def batched_multinomial( weights: Tensor, num_samples: int, replacement: bool = False, - generator: Optional[torch.Generator] = None, - out: Optional[Tensor] = None, + generator: torch.Generator | None = None, + out: Tensor | None = None, ) -> LongTensor: r"""Sample from multinomial with an arbitrary number of batch dimensions. @@ -376,8 +376,8 @@ def _convert_bounds_to_inequality_constraints(bounds: Tensor) -> tuple[Tensor, T def find_interior_point( A: np.ndarray, b: np.ndarray, - A_eq: Optional[np.ndarray] = None, - b_eq: Optional[np.ndarray] = None, + A_eq: np.ndarray | None = None, + b_eq: np.ndarray | None = None, ) -> np.ndarray: r"""Find an interior point of a polytope via linear programming. @@ -448,7 +448,7 @@ def find_interior_point( elif result.status > 0: raise ValueError( "Problem checking constraint specification. 
" - + "linprog status: {}".format(result.message) + + f"linprog status: {result.message}" ) # the x in the result is really (x, s) return result.x[:-1] @@ -459,10 +459,10 @@ class PolytopeSampler(ABC): def __init__( self, - inequality_constraints: Optional[tuple[Tensor, Tensor]] = None, - equality_constraints: Optional[tuple[Tensor, Tensor]] = None, - bounds: Optional[Tensor] = None, - interior_point: Optional[Tensor] = None, + inequality_constraints: tuple[Tensor, Tensor] | None = None, + equality_constraints: tuple[Tensor, Tensor] | None = None, + bounds: Tensor | None = None, + interior_point: Tensor | None = None, ) -> None: r""" Args: @@ -583,13 +583,13 @@ class HitAndRunPolytopeSampler(PolytopeSampler): def __init__( self, - inequality_constraints: Optional[tuple[Tensor, Tensor]] = None, - equality_constraints: Optional[tuple[Tensor, Tensor]] = None, - bounds: Optional[Tensor] = None, - interior_point: Optional[Tensor] = None, + inequality_constraints: tuple[Tensor, Tensor] | None = None, + equality_constraints: tuple[Tensor, Tensor] | None = None, + bounds: Tensor | None = None, + interior_point: Tensor | None = None, n_burnin: int = 200, n_thinning: int = 20, - seed: Optional[int] = None, + seed: int | None = None, ) -> None: r"""A sampler for sampling from a polyope using a hit-and-run algorithm. @@ -660,9 +660,9 @@ def __init__( self.n_burnin: int = n_burnin self.n_thinning: int = n_thinning self.num_samples_generated: int = 0 - self._seed: Optional[int] = seed - self._offset: Optional[Tensor] = offset - self._scale: Optional[Tensor] = scale + self._seed: int | None = seed + self._offset: Tensor | None = offset + self._scale: Tensor | None = scale def draw(self, n: int = 1) -> Tensor: r"""Draw samples from the polytope. @@ -728,10 +728,10 @@ class DelaunayPolytopeSampler(PolytopeSampler): def __init__( self, - inequality_constraints: Optional[tuple[Tensor, Tensor]] = None, - equality_constraints: Optional[tuple[Tensor, Tensor]] = None, - bounds: Optional[Tensor] = None, - interior_point: Optional[Tensor] = None, + inequality_constraints: tuple[Tensor, Tensor] | None = None, + equality_constraints: tuple[Tensor, Tensor] | None = None, + bounds: Tensor | None = None, + interior_point: Tensor | None = None, ) -> None: r"""Initialize DelaunayPolytopeSampler. @@ -790,7 +790,7 @@ def __init__( self._polytopes = polytopes self._p = volumes / volumes.sum() - def draw(self, n: int = 1, seed: Optional[int] = None) -> Tensor: + def draw(self, n: int = 1, seed: int | None = None) -> Tensor: r"""Draw samples from the polytope. 
Args: @@ -884,9 +884,9 @@ def normalize_dense_linear_constraints( def get_polytope_samples( n: int, bounds: Tensor, - inequality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - equality_constraints: Optional[list[tuple[Tensor, Tensor, float]]] = None, - seed: Optional[int] = None, + inequality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + equality_constraints: list[tuple[Tensor, Tensor, float]] | None = None, + seed: int | None = None, n_burnin: int = 10_000, n_thinning: int = 32, ) -> Tensor: @@ -990,8 +990,8 @@ def sparse_to_dense_constraints( def optimize_posterior_samples( paths: SamplePath, bounds: Tensor, - candidates: Optional[Tensor] = None, - raw_samples: Optional[int] = 1024, + candidates: Tensor | None = None, + raw_samples: int | None = 1024, num_restarts: int = 20, maximize: bool = True, **kwargs: Any, diff --git a/botorch/utils/test_helpers.py b/botorch/utils/test_helpers.py index 6d99758c7b..e6be02bd8e 100644 --- a/botorch/utils/test_helpers.py +++ b/botorch/utils/test_helpers.py @@ -12,7 +12,7 @@ from __future__ import annotations import math -from typing import Any, Optional, Union +from typing import Any import torch from botorch.acquisition.objective import PosteriorTransform @@ -58,7 +58,7 @@ def get_model( train_Y: Tensor, standardize_model: bool = False, use_model_list: bool = False, -) -> Union[SingleTaskGP, ModelListGP]: +) -> SingleTaskGP | ModelListGP: num_objectives = train_Y.shape[-1] if standardize_model: @@ -173,11 +173,11 @@ def standardize_moments( def gen_multi_task_dataset( - yvar: Optional[float] = None, - task_values: Optional[list[int]] = None, + yvar: float | None = None, + task_values: list[int] | None = None, skip_task_features_in_datasets: bool = False, **tkwargs, -) -> tuple[MultiTaskDataset, tuple[Tensor, Tensor, Optional[Tensor]]]: +) -> tuple[MultiTaskDataset, tuple[Tensor, Tensor, Tensor | None]]: """Constructs a multi-task dataset with two tasks, each with 10 data points. Args: diff --git a/botorch/utils/testing.py b/botorch/utils/testing.py index aa58e85984..82196f7396 100644 --- a/botorch/utils/testing.py +++ b/botorch/utils/testing.py @@ -12,7 +12,7 @@ from collections import OrderedDict from collections.abc import Sequence from itertools import product -from typing import Any, Optional +from typing import Any from unittest import mock, TestCase import torch @@ -307,7 +307,7 @@ def variance(self): def rsample( self, - sample_shape: Optional[torch.Size] = None, + sample_shape: torch.Size | None = None, ) -> Tensor: """Mock sample by repeating self._samples. 
If base_samples is provided, do a shape check but return the same mock samples.""" @@ -346,8 +346,8 @@ def __init__(self, posterior: MockPosterior) -> None: # noqa: D107 def posterior( self, X: Tensor, - output_indices: Optional[list[int]] = None, - posterior_transform: Optional[PosteriorTransform] = None, + output_indices: list[int] | None = None, + posterior_transform: PosteriorTransform | None = None, observation_noise: bool | torch.Tensor = False, ) -> MockPosterior: if posterior_transform is not None: @@ -369,7 +369,7 @@ def state_dict(self, *args, **kwargs) -> None: pass def load_state_dict( - self, state_dict: Optional[OrderedDict] = None, strict: bool = False + self, state_dict: OrderedDict | None = None, strict: bool = False ) -> None: pass @@ -384,7 +384,7 @@ def __init__(self): # noqa: D107 def __call__(self, X): return X[..., 0].max(dim=-1).values - def set_X_pending(self, X_pending: Optional[Tensor] = None): + def set_X_pending(self, X_pending: Tensor | None = None): self.X_pending = X_pending @@ -482,7 +482,7 @@ def _get_max_violation_of_bounds(samples: torch.Tensor, bounds: torch.Tensor) -> def _get_max_violation_of_constraints( samples: torch.Tensor, - constraints: Optional[list[tuple[Tensor, Tensor, float]]], + constraints: list[tuple[Tensor, Tensor, float]] | None, equality: bool, ) -> float: r""" diff --git a/botorch/utils/torch.py b/botorch/utils/torch.py index 34fb004592..ccb7db8cd4 100644 --- a/botorch/utils/torch.py +++ b/botorch/utils/torch.py @@ -59,7 +59,7 @@ def __init__(self, buffers=None): buffers: A mapping (dictionary) from string to :class:`~torch.Tensor`, or an iterable of key-value pairs of type (string, :class:`~torch.Tensor`). """ - super(BufferDict, self).__init__() + super().__init__() if buffers is not None: self.update(buffers) @@ -152,7 +152,7 @@ def extra_repr(self): child_lines = [] for k, p in self._buffers.items(): size_str = "x".join(str(size) for size in p.size()) - device_str = "" if not p.is_cuda else " (GPU {})".format(p.get_device()) + device_str = "" if not p.is_cuda else f" (GPU {p.get_device()})" parastr = "Buffer containing: [{} of size {}{}]".format( torch.typename(p), size_str, device_str ) diff --git a/botorch/utils/transforms.py b/botorch/utils/transforms.py index 6770db9bd8..a3e9b7c292 100644 --- a/botorch/utils/transforms.py +++ b/botorch/utils/transforms.py @@ -11,8 +11,9 @@ from __future__ import annotations import warnings +from collections.abc import Callable from functools import wraps -from typing import Any, Callable, Optional, TYPE_CHECKING +from typing import Any, TYPE_CHECKING import torch from botorch.utils.safe_math import logmeanexp @@ -119,7 +120,7 @@ def unnormalize(X: Tensor, bounds: Tensor) -> Tensor: return X * (bounds[1] - bounds[0]) + bounds[0] -def normalize_indices(indices: Optional[list[int]], d: int) -> Optional[list[int]]: +def normalize_indices(indices: list[int] | None, d: int) -> list[int] | None: r"""Normalize a list of indices to ensure that they are positive. 
Args: @@ -226,7 +227,7 @@ def is_ensemble(model: Model) -> bool: def t_batch_mode_transform( - expected_q: Optional[int] = None, + expected_q: int | None = None, assert_output_shape: bool = True, ) -> Callable[ [Callable[[AcquisitionFunction, Any], Any]], diff --git a/test/acquisition/multi_objective/test_monte_carlo.py b/test/acquisition/multi_objective/test_monte_carlo.py index 0d8b784185..6acf3a5f16 100644 --- a/test/acquisition/multi_objective/test_monte_carlo.py +++ b/test/acquisition/multi_objective/test_monte_carlo.py @@ -8,7 +8,7 @@ from copy import deepcopy from itertools import product from math import pi -from typing import Any, Optional +from typing import Any from unittest import mock from warnings import catch_warnings, simplefilter @@ -110,7 +110,7 @@ def _test_q_expected_hypervolume_improvement( self, acqf_class: type[MultiObjectiveMCAcquisitionFunction], dtype: torch.dtype, - acqf_kwargs: Optional[dict[str, Any]] = None, + acqf_kwargs: dict[str, Any] | None = None, ): if acqf_kwargs is None: acqf_kwargs = {} @@ -558,7 +558,7 @@ def _test_constrained_q_expected_hypervolume_improvement( self, acqf_class: type[AcquisitionFunction], dtype: torch.dtype, - acqf_kwargs: Optional[dict[str, Any]] = None, + acqf_kwargs: dict[str, Any] | None = None, ): if acqf_kwargs is None: acqf_kwargs = {} diff --git a/test/acquisition/multi_objective/test_multi_output_risk_measures.py b/test/acquisition/multi_objective/test_multi_output_risk_measures.py index 1cebfbc291..72781fb8bb 100644 --- a/test/acquisition/multi_objective/test_multi_output_risk_measures.py +++ b/test/acquisition/multi_objective/test_multi_output_risk_measures.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. import warnings -from typing import Optional import torch from botorch import settings @@ -32,7 +31,7 @@ class NotSoAbstractMORiskMeasure(MultiOutputRiskMeasureMCObjective): - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: prepared_samples = self._prepare_samples(samples) return prepared_samples.sum(dim=-2) diff --git a/test/acquisition/multi_objective/test_parego.py b/test/acquisition/multi_objective/test_parego.py index 2624a456ef..3a8e99f32d 100644 --- a/test/acquisition/multi_objective/test_parego.py +++ b/test/acquisition/multi_objective/test_parego.py @@ -3,7 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Any, Optional +from typing import Any import torch from botorch.acquisition.logei import qLogNoisyExpectedImprovement @@ -25,7 +25,7 @@ def base_test_parego( with_constraints: bool = False, with_scalarization_weights: bool = False, with_objective: bool = False, - model: Optional[Model] = None, + model: Model | None = None, ) -> None: if with_constraints: assert with_objective, "Objective must be specified if constraints are." diff --git a/test/acquisition/test_cached_cholesky.py b/test/acquisition/test_cached_cholesky.py index 8d813b8506..ffa9d545b0 100644 --- a/test/acquisition/test_cached_cholesky.py +++ b/test/acquisition/test_cached_cholesky.py @@ -5,7 +5,6 @@ # LICENSE file in the root directory of this source tree. 
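Stepping back, nearly every hunk in this patch applies one mechanical rewrite. A minimal before/after sketch of the equivalence (`before` and `after` are hypothetical functions; the `|` syntax needs Python 3.10+ at runtime, or any supported version when annotations are deferred as below):

    from __future__ import annotations  # defers evaluation; `|` then parses pre-3.10

    from typing import Optional, Union

    # `Optional[X]` is exactly `Union[X, None]`; PEP 604 spells both with `|`
    # and removes the need for these typing imports.
    def before(x: Optional[int] = None, y: Union[int, float] = 0) -> Optional[float]:
        return None if x is None else float(x + y)

    def after(x: int | None = None, y: int | float = 0) -> float | None:
        return None if x is None else float(x + y)

    assert before(1, 2.5) == after(1, 2.5) == 3.5
    assert before() is after() is None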
import warnings -from typing import Optional from unittest import mock import torch @@ -32,8 +31,8 @@ class DummyCachedCholeskyAcqf(MCAcquisitionFunction, CachedCholeskyMCSamplerMixi def __init__( self, model: Model, - objective: Optional[MCAcquisitionObjective] = None, - sampler: Optional[MCSampler] = None, + objective: MCAcquisitionObjective | None = None, + sampler: MCSampler | None = None, cache_root: bool = False, ): """A dummy cached cholesky acquisition function.""" diff --git a/test/acquisition/test_input_constructors.py b/test/acquisition/test_input_constructors.py index ecab92375d..7478ee8a0b 100644 --- a/test/acquisition/test_input_constructors.py +++ b/test/acquisition/test_input_constructors.py @@ -12,8 +12,8 @@ from __future__ import annotations import math +from collections.abc import Callable from functools import reduce -from typing import Callable from unittest import mock from unittest.mock import MagicMock diff --git a/test/acquisition/test_max_value_entropy_search.py b/test/acquisition/test_max_value_entropy_search.py index d610a16060..f46fd0dac4 100644 --- a/test/acquisition/test_max_value_entropy_search.py +++ b/test/acquisition/test_max_value_entropy_search.py @@ -4,7 +4,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Callable, Optional +from collections.abc import Callable from unittest import mock import torch @@ -47,7 +47,7 @@ def posterior( self, X: Tensor, observation_noise: bool = False, - posterior_transform: Optional[PosteriorTransform] = None, + posterior_transform: PosteriorTransform | None = None, ) -> MockPosterior: m_shape = X.shape[:-1] r_shape = list(X.shape[:-2]) + [1, 1] diff --git a/test/acquisition/test_objective.py b/test/acquisition/test_objective.py index d8c3ae9b2f..b108c6df60 100644 --- a/test/acquisition/test_objective.py +++ b/test/acquisition/test_objective.py @@ -6,7 +6,6 @@ import itertools import warnings -from typing import Optional import torch from botorch.acquisition import LearnedObjective @@ -422,8 +421,8 @@ def setUp(self, suppress_input_warnings: bool = False) -> None: def _get_pref_model( self, - dtype: Optional[torch.dtype] = None, - input_transform: Optional[Normalize] = None, + dtype: torch.dtype | None = None, + input_transform: Normalize | None = None, ) -> PairwiseGP: train_X = torch.rand((2, self.x_dim), dtype=dtype) train_comps = torch.LongTensor([[0, 1]]) diff --git a/test/acquisition/test_risk_measures.py b/test/acquisition/test_risk_measures.py index b2c416bf91..2f655e113b 100644 --- a/test/acquisition/test_risk_measures.py +++ b/test/acquisition/test_risk_measures.py @@ -5,8 +5,6 @@ # LICENSE file in the root directory of this source tree. -from typing import Optional - import torch from botorch.acquisition.objective import LinearMCObjective from botorch.acquisition.risk_measures import ( @@ -21,7 +19,7 @@ class NotSoAbstractRiskMeasure(RiskMeasureMCObjective): - def forward(self, samples: Tensor, X: Optional[Tensor] = None) -> Tensor: + def forward(self, samples: Tensor, X: Tensor | None = None) -> Tensor: prepared_samples = self._prepare_samples(samples) return prepared_samples.sum(dim=-1) diff --git a/test/generation/test_gen.py b/test/generation/test_gen.py index f260bb30e7..eb12bbd32e 100644 --- a/test/generation/test_gen.py +++ b/test/generation/test_gen.py @@ -230,11 +230,9 @@ def test_gen_candidates_scipy_warns_opt_failure(self): " and message ABNORMAL_TERMINATION_IN_LNSRCH." 
) expected_warning_raised = any( - ( - issubclass(w.category, OptimizationWarning) - and expected_msg in str(w.message) - for w in ws - ) + issubclass(w.category, OptimizationWarning) + and expected_msg in str(w.message) + for w in ws ) self.assertTrue(expected_warning_raised) @@ -300,11 +298,9 @@ def test_gen_candidates_scipy_warns_opt_no_res(self): acquisition_function=MockAcquisitionFunction(), ) expected_warning_raised = any( - ( - issubclass(w.category, OptimizationWarning) - and expected_msg in str(w.message) - for w in ws - ) + issubclass(w.category, OptimizationWarning) + and expected_msg in str(w.message) + for w in ws ) self.assertTrue(expected_warning_raised) diff --git a/test/models/test_fully_bayesian_multitask.py b/test/models/test_fully_bayesian_multitask.py index e1c924b62b..070e8547f5 100644 --- a/test/models/test_fully_bayesian_multitask.py +++ b/test/models/test_fully_bayesian_multitask.py @@ -6,7 +6,6 @@ import itertools -from typing import Optional import torch from botorch import fit_fully_bayesian_model_nuts @@ -76,8 +75,8 @@ class TestFullyBayesianMultiTaskGP(BotorchTestCase): def _get_data_and_model( self, - task_rank: Optional[int] = None, - output_tasks: Optional[list[int]] = None, + task_rank: int | None = None, + output_tasks: list[int] | None = None, infer_noise: bool = False, use_outcome_transform: bool = True, **tkwargs, diff --git a/test/models/test_gpytorch.py b/test/models/test_gpytorch.py index 6727b26e91..5b5ba99180 100644 --- a/test/models/test_gpytorch.py +++ b/test/models/test_gpytorch.py @@ -6,7 +6,6 @@ import itertools import warnings -from typing import Optional import torch from botorch import settings @@ -62,7 +61,7 @@ def transform(self, X: Tensor) -> Tensor: class SimpleBatchedMultiOutputGPyTorchModel( BatchedMultiOutputGPyTorchModel, ExactGP, FantasizeMixin ): - _batch_shape: Optional[torch.Size] = None + _batch_shape: torch.Size | None = None def __init__(self, train_X, train_Y, outcome_transform=None, input_transform=None): r""" diff --git a/test/models/test_model_list_gp_regression.py b/test/models/test_model_list_gp_regression.py index f27080dafb..8cde27e6bc 100644 --- a/test/models/test_model_list_gp_regression.py +++ b/test/models/test_model_list_gp_regression.py @@ -7,7 +7,6 @@ import itertools import warnings from copy import deepcopy -from typing import Optional import torch from botorch.acquisition.objective import ScalarizedPosteriorTransform @@ -561,7 +560,7 @@ def test_fantasize_with_outcome_transform(self) -> None: def _get_fant_mean( model: ModelListGP, sampler: MCSampler, - eval_mask: Optional[Tensor] = None, + eval_mask: Tensor | None = None, ) -> float: fant = model.fantasize( target_x, # noqa diff --git a/test/models/test_multitask.py b/test/models/test_multitask.py index 9e5582451e..b9edc3e073 100644 --- a/test/models/test_multitask.py +++ b/test/models/test_multitask.py @@ -7,7 +7,7 @@ import itertools import math import warnings -from typing import Any, Optional +from typing import Any import torch from botorch.acquisition.objective import ScalarizedPosteriorTransform @@ -43,11 +43,11 @@ def _gen_model_and_data( fixed_noise: bool, task_feature: int = 0, - output_tasks: Optional[list[int]] = None, - task_values: Optional[list[int]] = None, + output_tasks: list[int] | None = None, + task_values: list[int] | None = None, skip_task_features_in_datasets: bool = False, - input_transform: Optional[InputTransform] = None, - outcome_transform: Optional[OutcomeTransform] = None, + input_transform: InputTransform | None = None, + 
outcome_transform: OutcomeTransform | None = None, **tkwargs, ): datasets, (train_X, train_Y, train_Yvar) = gen_multi_task_dataset( diff --git a/test/models/test_pairwise_gp.py b/test/models/test_pairwise_gp.py index 34b261f606..ff7d527f6c 100644 --- a/test/models/test_pairwise_gp.py +++ b/test/models/test_pairwise_gp.py @@ -7,7 +7,6 @@ import itertools import random import warnings -from typing import Union import torch from botorch.acquisition.objective import ScalarizedPosteriorTransform @@ -63,7 +62,7 @@ def _get_model_and_data( batch_shape, X_dim=2, likelihood_cls=None, - ) -> tuple[Model, dict[str, Union[Tensor, PairwiseLikelihood]]]: + ) -> tuple[Model, dict[str, Tensor | PairwiseLikelihood]]: train_X, train_comp = self._make_rand_mini_data( batch_shape=batch_shape, X_dim=X_dim, diff --git a/test/optim/test_fit.py b/test/optim/test_fit.py index 55bcf33908..c28db6b5f6 100644 --- a/test/optim/test_fit.py +++ b/test/optim/test_fit.py @@ -114,7 +114,7 @@ def _test_fit_gpytorch_mll_scipy(self, mll): njev=1, nhev=1, nit=1, - message="ABNORMAL_TERMINATION_IN_LNSRCH".encode(), + message=b"ABNORMAL_TERMINATION_IN_LNSRCH", ) with catch_warnings(record=True) as ws, debug(True): fit.fit_gpytorch_mll_scipy(mll, options=options) diff --git a/test/optim/test_initializers.py b/test/optim/test_initializers.py index 4b3ba3527c..925e9cec7a 100644 --- a/test/optim/test_initializers.py +++ b/test/optim/test_initializers.py @@ -8,7 +8,6 @@ from contextlib import ExitStack from itertools import product from random import random -from typing import Optional from unittest import mock import torch @@ -506,8 +505,8 @@ def test_gen_batch_initial_conditions_sample_q_batches_from_polytope(self): # samples are always on cpu def _to_self_device( - x: Optional[torch.Tensor], - ) -> Optional[torch.Tensor]: + x: torch.Tensor | None, + ) -> torch.Tensor | None: return None if x is None else x.to(device=self.device) self.assertLess( @@ -714,7 +713,7 @@ def test_gen_batch_initial_conditions_generator(self): [True, False], [None, 1234], [None, 1], [None, {0: 0.5}] ): - def generator(n: int, q: int, seed: Optional[int]): + def generator(n: int, q: int, seed: int | None): with manual_seed(seed): X_rnd_nlzd = torch.rand( n, @@ -770,7 +769,7 @@ def generator(n: int, q: int, seed: Optional[int]): def test_error_generator_with_sample_around_best(self): tkwargs = {"device": self.device, "dtype": torch.double} - def generator(n: int, q: int, seed: Optional[int]): + def generator(n: int, q: int, seed: int | None): return torch.rand(n, q, 3).to(**tkwargs) with self.assertRaisesRegex( diff --git a/test/optim/test_optimize.py b/test/optim/test_optimize.py index df41158625..70555f8605 100644 --- a/test/optim/test_optimize.py +++ b/test/optim/test_optimize.py @@ -643,10 +643,8 @@ def test_optimize_acqf_warns_on_opt_failure(self): "`batch_initial_conditions.`" ) expected_warning_raised = any( - ( - issubclass(w.category, RuntimeWarning) and message in str(w.message) - for w in ws - ) + issubclass(w.category, RuntimeWarning) and message in str(w.message) + for w in ws ) self.assertTrue(expected_warning_raised) @@ -690,10 +688,8 @@ def test_optimize_acqf_successfully_restarts_on_opt_failure(self): "_IN_LNSRCH.')]\nTrying again with a new set of initial conditions." 
) expected_warning_raised = any( - ( - issubclass(w.category, RuntimeWarning) and message in str(w.message) - for w in ws - ) + issubclass(w.category, RuntimeWarning) and message in str(w.message) + for w in ws ) self.assertTrue(expected_warning_raised) # check if it succeeded on restart -- the maximum value of sin(1/x) is 1 @@ -714,10 +710,8 @@ def test_optimize_acqf_successfully_restarts_on_opt_failure(self): retry_on_optimization_warning=False, ) expected_warning_raised = any( - ( - issubclass(w.category, RuntimeWarning) and message in str(w.message) - for w in ws - ) + issubclass(w.category, RuntimeWarning) and message in str(w.message) + for w in ws ) self.assertFalse(expected_warning_raised) @@ -765,16 +759,12 @@ def test_optimize_acqf_warns_on_second_opt_failure(self): "of initial conditions." ) first_expected_warning_raised = any( - ( - issubclass(w.category, RuntimeWarning) and message_1 in str(w.message) - for w in ws - ) + issubclass(w.category, RuntimeWarning) and message_1 in str(w.message) + for w in ws ) second_expected_warning_raised = any( - ( - issubclass(w.category, RuntimeWarning) and message_2 in str(w.message) - for w in ws - ) + issubclass(w.category, RuntimeWarning) and message_2 in str(w.message) + for w in ws ) self.assertTrue(first_expected_warning_raised) self.assertTrue(second_expected_warning_raised) diff --git a/test/optim/test_parameter_constraints.py b/test/optim/test_parameter_constraints.py index b4da8e472a..435c99fcb0 100644 --- a/test/optim/test_parameter_constraints.py +++ b/test/optim/test_parameter_constraints.py @@ -4,8 +4,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +from collections.abc import Callable from itertools import product -from typing import Callable import numpy as np import torch diff --git a/test/posteriors/test_transformed.py b/test/posteriors/test_transformed.py index 66c03a307c..28fd6dbd64 100644 --- a/test/posteriors/test_transformed.py +++ b/test/posteriors/test_transformed.py @@ -84,7 +84,7 @@ def test_transformed_posterior(self): # check that `mean` works even if posterior doesn't have `variance` for error in (AttributeError, NotImplementedError): - class DummyPosterior(object): + class DummyPosterior: mean = torch.zeros(5) @property diff --git a/test/test_cuda.py b/test/test_cuda.py index 4f85b8bcb9..1af9ec2594 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -15,7 +15,6 @@ import unittest from itertools import chain from pathlib import Path -from typing import Union import torch from botorch.utils.testing import BotorchTestCase @@ -29,7 +28,7 @@ def test_cuda(self): self.assertTrue(run_cuda_tests(tests)) -def run_cuda_tests(tests: Union[unittest.TestCase, unittest.TestSuite]) -> bool: +def run_cuda_tests(tests: unittest.TestCase | unittest.TestSuite) -> bool: """Function for running all tests on cuda (except TestBotorchCUDA itself)""" if isinstance(tests, BotorchTestCase): tests.device = torch.device("cuda") diff --git a/test/test_fit.py b/test/test_fit.py index 62e18ccea6..20dd7a732a 100644 --- a/test/test_fit.py +++ b/test/test_fit.py @@ -5,11 +5,10 @@ # LICENSE file in the root directory of this source tree. 
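The warning-check cleanups in the test hunks above are purely cosmetic. A quick sketch (the warning text and `msg` are illustrative) showing that the extra parentheses only group the generator expression and change nothing about what `any` evaluates:

    import warnings

    with warnings.catch_warnings(record=True) as ws:
        warnings.warn("ABNORMAL_TERMINATION_IN_LNSRCH", RuntimeWarning)

    msg = "LNSRCH"
    old = any(
        (
            issubclass(w.category, RuntimeWarning) and msg in str(w.message)
            for w in ws
        )
    )
    new = any(
        issubclass(w.category, RuntimeWarning) and msg in str(w.message)
        for w in ws
    )
    assert old and new  # identical results; only the redundant parens differ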
import math -from collections.abc import Iterable +from collections.abc import Callable, Iterable from contextlib import ExitStack, nullcontext from copy import deepcopy from itertools import filterfalse, product -from typing import Callable, Optional from unittest.mock import MagicMock, patch from warnings import catch_warnings, warn, WarningMessage @@ -40,7 +39,7 @@ def __init__( self, randomize_requires_grad: bool = True, warnings: Iterable[WarningMessage] = (), - exception: Optional[BaseException] = None, + exception: BaseException | None = None, ): r"""Class used to mock `optimizer` argument to `fit_gpytorch_mll.""" self.randomize_requires_grad = randomize_requires_grad @@ -49,7 +48,7 @@ def __init__( self.call_count = 0 self.state_dicts = [] - def __call__(self, mll, closure: Optional[Callable] = None) -> OptimizationResult: + def __call__(self, mll, closure: Callable | None = None) -> OptimizationResult: self.call_count += 1 for w in self.warnings: warn(str(w.message), w.category) diff --git a/test/utils/probability/test_bvn.py b/test/utils/probability/test_bvn.py index 37910762aa..819acb6320 100644 --- a/test/utils/probability/test_bvn.py +++ b/test/utils/probability/test_bvn.py @@ -6,8 +6,10 @@ from __future__ import annotations +from collections.abc import Callable + from itertools import count -from typing import Any, Callable, Optional, Union +from typing import Any import torch from botorch.exceptions import UnsupportedError @@ -24,11 +26,11 @@ def run_gaussian_estimator( - estimator: Callable[[Tensor], tuple[Tensor, Union[Tensor, float, int]]], + estimator: Callable[[Tensor], tuple[Tensor, Tensor | float | int]], sqrt_cov: Tensor, num_samples: int, - batch_limit: Optional[int] = None, - seed: Optional[int] = None, + batch_limit: int | None = None, + seed: int | None = None, ) -> Tensor: if batch_limit is None: @@ -70,7 +72,7 @@ def setUp( mc_atol_multiplier: float = 4.0, seed: int = 1, dtype: torch.dtype = torch.float64, - device: Optional[torch.device] = None, + device: torch.device | None = None, ): super().setUp() self.dtype = dtype diff --git a/test/utils/probability/test_mvnxpb.py b/test/utils/probability/test_mvnxpb.py index 7693b01530..931d13374f 100644 --- a/test/utils/probability/test_mvnxpb.py +++ b/test/utils/probability/test_mvnxpb.py @@ -6,13 +6,13 @@ from __future__ import annotations -from collections.abc import Sequence +from collections.abc import Callable, Sequence from copy import deepcopy from functools import partial from itertools import count -from typing import Any, Callable, Optional, Union +from typing import Any from unittest.mock import patch import torch @@ -24,11 +24,11 @@ def run_gaussian_estimator( - estimator: Callable[[Tensor], tuple[Tensor, Union[Tensor, float, int]]], + estimator: Callable[[Tensor], tuple[Tensor, Tensor | float | int]], sqrt_cov: Tensor, num_samples: int, - batch_limit: Optional[int] = None, - seed: Optional[int] = None, + batch_limit: int | None = None, + seed: int | None = None, ) -> Tensor: if batch_limit is None: @@ -118,7 +118,7 @@ def gen_bounds( self, ndim: int, batch_shape: Sequence[int] = (), - bound_range: Optional[tuple[float, float]] = None, + bound_range: tuple[float, float] | None = None, ) -> tuple[Tensor, Tensor]: shape = tuple(batch_shape) + (ndim,) lower = torch.rand(shape, **self.tkwargs) diff --git a/test/utils/probability/test_unified_skew_normal.py b/test/utils/probability/test_unified_skew_normal.py index 34da68b887..f2ab236a8c 100644 --- a/test/utils/probability/test_unified_skew_normal.py +++ 
b/test/utils/probability/test_unified_skew_normal.py @@ -12,7 +12,7 @@ from itertools import count -from typing import Any, Optional +from typing import Any import torch from botorch.utils.probability.mvnxpb import MVNXPB @@ -39,7 +39,7 @@ def setUp( mc_atol_multiplier: float = 4.0, seed: int = 1, dtype: torch.dtype = torch.float64, - device: Optional[torch.device] = None, + device: torch.device | None = None, ): super().setUp() self.dtype = dtype diff --git a/test/utils/test_datasets.py b/test/utils/test_datasets.py index b617ec1680..5301066ca0 100644 --- a/test/utils/test_datasets.py +++ b/test/utils/test_datasets.py @@ -4,7 +4,6 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -from typing import Optional import torch from botorch.exceptions.errors import InputDataError, UnsupportedError @@ -25,9 +24,9 @@ def make_dataset( d: int = 2, m: int = 1, has_yvar: bool = False, - feature_names: Optional[list[str]] = None, - outcome_names: Optional[list[str]] = None, - batch_shape: Optional[torch.Size] = None, + feature_names: list[str] | None = None, + outcome_names: list[str] | None = None, + batch_shape: torch.Size | None = None, ) -> SupervisedDataset: feature_names = feature_names or [f"x{i}" for i in range(d)] outcome_names = outcome_names or [f"y{i}" for i in range(m)] diff --git a/test/utils/test_safe_math.py b/test/utils/test_safe_math.py index 97192b6201..35389dceaf 100644 --- a/test/utils/test_safe_math.py +++ b/test/utils/test_safe_math.py @@ -10,8 +10,8 @@ import math from abc import abstractmethod +from collections.abc import Callable from itertools import combinations, product -from typing import Callable import torch from botorch.exceptions import UnsupportedError diff --git a/test/utils/test_sampling.py b/test/utils/test_sampling.py index 30be5694e5..78418d2879 100644 --- a/test/utils/test_sampling.py +++ b/test/utils/test_sampling.py @@ -581,4 +581,4 @@ def test_optimize_posterior_samples(self): # Check that all of the found optima are larger than the observations # This is not 100% deterministic, but just about. - self.assertTrue(torch.all((f_opt > Y.max(dim=-2).values))) + self.assertTrue(torch.all(f_opt > Y.max(dim=-2).values))
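Finally, a numeric spot-check of the `safe_math` helpers whose signatures were modernized earlier in this patch (a sketch with illustrative values and tolerances): per the hunks above, `logmeanexp` is `logsumexp` minus `log(n)`, and `smooth_amax` is a softmax-style upper bound on the true max that tightens as `tau` shrinks:

    import math

    import torch
    from botorch.utils.safe_math import logmeanexp, smooth_amax

    X = torch.tensor([[1.0, 2.0, 3.0]])
    # logmeanexp(X) == logsumexp(X) - log(n), as shown in the diff
    assert torch.isclose(
        logmeanexp(X, dim=-1), torch.logsumexp(X, dim=-1) - math.log(3)
    )
    # smooth_amax overshoots amax by at most roughly tau * log(n)
    assert torch.isclose(
        smooth_amax(X, dim=-1, tau=1e-3), X.amax(dim=-1), atol=1e-2
    )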