diff --git a/dali/python/nvidia/dali/external_source.py b/dali/python/nvidia/dali/external_source.py index 9e7ab9b65c..bb86627e93 100644 --- a/dali/python/nvidia/dali/external_source.py +++ b/dali/python/nvidia/dali/external_source.py @@ -26,6 +26,7 @@ SourceKind as _SourceKind, ) + def _get_shape(data): if isinstance(data, (_tensors.TensorCPU, _tensors.TensorGPU)): if callable(data.shape): diff --git a/dali/python/nvidia/dali/pipeline.py b/dali/python/nvidia/dali/pipeline.py index d7dfbbc6b8..192e049cd8 100644 --- a/dali/python/nvidia/dali/pipeline.py +++ b/dali/python/nvidia/dali/pipeline.py @@ -478,6 +478,11 @@ def is_restored_from_checkpoint(self): """If True, this pipeline was restored from checkpoint.""" return self._is_restored_from_checkpoint + @property + def num_outputs(self): + """Number of pipeline outputs.""" + return self._num_outputs + def output_dtype(self) -> list: """Data types expected at the outputs.""" return [elem if elem != types.NO_TYPE else None for elem in self._pipe.output_dtype()] @@ -855,6 +860,7 @@ def contains_nested_datanode(nested): self._require_no_foreign_ops("The pipeline does not support checkpointing") self._graph_outputs = outputs + self._num_outputs = len(self._graph_outputs) self._setup_input_callbacks() self._disable_pruned_external_source_instances() self._py_graph_built = True diff --git a/dali/python/nvidia/dali/plugin/pytorch/experimental/proxy/__init__.py b/dali/python/nvidia/dali/plugin/pytorch/experimental/proxy/__init__.py index f3fb023d41..11668f9b00 100644 --- a/dali/python/nvidia/dali/plugin/pytorch/experimental/proxy/__init__.py +++ b/dali/python/nvidia/dali/plugin/pytorch/experimental/proxy/__init__.py @@ -12,17 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. -import torch -import torch.multiprocessing as mp -from torch.utils import data as torchdata -from torch.utils.data._utils.collate import default_collate_fn_map -from nvidia.dali import Pipeline -from nvidia.dali.external_source import ExternalSource +__all__ = ["DALIServer", "DataLoader"] + +import torch as _torch +import torch.multiprocessing as _mp +from torch.utils import data as _torchdata +from torch.utils.data._utils.collate import default_collate_fn_map as _default_collate_fn_map +from nvidia.dali import Pipeline as _Pipeline +from nvidia.dali.external_source import ExternalSource as _ExternalSource import threading import queue from queue import Empty from nvidia.dali.plugin.pytorch.torch_utils import to_torch_tensor import warnings +import inspect # DALI proxy is a PyTorch specific layer that connects a multi-process PyTorch DataLoader with # a single DALI pipeline. 
This allows to run CUDA processing in a single process, avoiding the @@ -37,9 +40,9 @@ # +-------+ +---------------+ +-------------+ +---------------+ +-----------+ +-----------+ # | main | | dali_output_q | | data_thread | | dali_input_q | | worker_0 | | worker_1 | # +-------+ +---------------+ +-------------+ +---------------+ +-----------+ +-----------+ -# |~~~get()~~~~~~>| | | | | -# | | |~~~get()~~~~~~~>| | | -# | | | | | | +# |~~~get()~~~~~~>| | | | sample 0 | | +# | | |~~~get()~~~~~~~>| | sample 1 | | +# | | | | | sample N | | # | | | | ------------- | # | | | | | collate | | # | | | | ------------- | @@ -49,10 +52,10 @@ # | | | |---------------->| | # | | | | | | # | | |<--req_0_0------| | | -# | | | | | | -# | | --------------- | | | -# | | | run | | | | -# | | --------------- | | | +# | | | | | ------------- +# | | --------------- | | | sample 0 | +# | | | run #0 | | | | sample 1 | +# | | --------------- | | | sample N | # | | | | | ------------- # | | | | | | collate | # | | | | | ------------- @@ -61,11 +64,11 @@ # | | | | | | # | | | |------------------------------>| # | | | | | | -# | |<~~put(data_0_0)~~| | | | -# | | | | | | -# | |----------------->| | | | -# | | | | | | -# | | | | | ------------- +# | |<~~put(data_0_0)~~| | | ------------- +# | | | | | | sample 0 | +# | |----------------->| | | | sample 1 | +# |<--data_0_0----| | | | | sample N | +# |~~~get()~~~~~~>| | | | ------------- # | | | | | | collate | # | | | | | ------------- # | | | | | | @@ -73,17 +76,17 @@ # | | | | | | # | | | |------------------------------>| # | | | | | | -# |<--data_0_0----| | | | | -# | | | | | | -# |~~~get()~~~~~~>| | | | | -# | | |~~~get()~~~~~~~>| | | # | | | | | | -# | | |<--req_1_0------| | | # | | | | | | -# | | --------------- | | | -# | | | run | | | | -# | | --------------- | | | # | | | | | | +# | | |~~~get()~~~~~~~>| | | +# | | | | ------------- | +# | | |<--req_1_0------| | sample 0 | | +# | | | | | sample 1 | | +# | | --------------- | | sample N | | +# | | | run #1 | | ------------- | +# | | --------------- | | collate | | +# | | | | ------------- | # | | | |<~~put(req_0_1)~~| | # | | | | | | # | | | |-----------------| | @@ -91,9 +94,9 @@ # | |<~~put(data_1_0)~~| | | | # | | | | | | # | |----------------->| | | | -# | | | | | | # |<--data_1_0----| | | | | # | | | | | | +# | | | | | | # +-------+ +---------------+ +-------------+ +---------------+ +-----------+ +-----------+ # | main | | dali_output_q | | data_thread | | dali_input_q | | worker_0 | | worker_1 | # +-------+ +---------------+ +-------------+ +---------------+ +-----------+ +-----------+ @@ -114,74 +117,87 @@ def _external_source_node_names(pipeline): pipeline._build_graph() input_node_names = [] for op in pipeline._ops: - if isinstance(op._op, ExternalSource): + if isinstance(op._op, _ExternalSource): input_node_names.append(op.name) return input_node_names +class DALIOutputSampleRef: + """ + Reference for a single sample output bound to a pipeline run. + """ + + def __init__(self, proxy, pipe_run_ref, output_idx, sample_idx): + self.proxy = proxy + self.pipe_run_ref = pipe_run_ref + self.output_idx = output_idx + self.sample_idx = sample_idx + + def __repr__(self): + return ( + f"DALIOutputSampleRef({self.pipe_run_ref}, " + + f"output_idx={self.output_idx}, sample_idx={self.sample_idx})" + ) + + class DALIPipelineRunRef: """ Reference for a DALI pipeline run iteration. 
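
    A run ref accumulates the per-sample inputs for each of the pipeline's external
    sources and tracks whether the iteration was already sent to DALI (``is_scheduled``)
    and whether the batch has received all of its samples (``is_complete``).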
""" - def __init__(self, batch_id, inputs): - """ - batch_id: Identifier of the batch - is_scheduled: Whether the iteration has been scheduled for execution already - inputs: Inputs to be used when scheduling the iteration (makes sense only if - is_scheduled is False) - """ + def __init__(self, proxy, batch_id): self.batch_id = batch_id - self.inputs = inputs - assert self.inputs is not None + self.inputs = {name: [] for name in proxy._dali_input_names} self.is_scheduled = False + self.is_complete = False + + def __repr__(self): + return ( + f"DALIPipelineRunRef(batch_id={self.batch_id}, is_scheduled=" + f"{self.is_scheduled}, is_complete={self.is_complete})" + ) -class DALIProcessedSampleRef: +class DALIOutputBatchRef: """ - Placeholder for a pipeline run reference, which is returned by the data worker instead of - the actual data. The PyTorch worker returns this trivial object, only containing information - about this proxy instance and the input data to the pipeline. Later in the collate function, - we send the data for execution to DALI. + Reference for a batched output bound to a pipeline run. """ - def __init__(self, proxy, inputs): - self.proxy = proxy - self.inputs = inputs - if len(inputs) != len(self.proxy._dali_input_names): - raise RuntimeError( - f"Unexpected number of inputs. Expected: {self._dali_input_names}, got: {inputs}" - ) + def __init__(self, pipe_run_ref, output_idx): + self.pipe_run_ref = pipe_run_ref + self.output_idx = output_idx + + def __repr__(self): + return f"DALIOutputBatchRef(pipe_run_ref={self.pipe_run_ref}, output_idx={self.output_idx})" -def _collate_dali_processed_sample_ref_fn(samples, *, collate_fn_map=None): +def _collate_dali_output_sample_ref_fn(samples, *, collate_fn_map=None): """ Special collate function that schedules a DALI iteration for execution """ assert len(samples) > 0 - sample = samples[0] - inputs = [[] for _ in range(len(sample.inputs))] - proxy = sample.proxy - for sample in samples: - assert proxy == sample.proxy - for idx, input_ref in enumerate(sample.inputs): - inputs[idx].append(input_ref) - pipe_run_ref = proxy._create_pipe_run_ref(inputs) - if not proxy._deterministic: + pipe_run_ref = samples[0].pipe_run_ref + output_idx = samples[0].output_idx + proxy = samples[0].proxy + for i, sample in enumerate(samples): + # Sanity check + assert sample.proxy == proxy + assert sample.pipe_run_ref == pipe_run_ref + assert sample.output_idx == output_idx + assert sample.sample_idx == i, f"{samples}" + if not proxy._deterministic and not pipe_run_ref.is_scheduled: proxy._schedule_batch(pipe_run_ref) - # No need for the inputs now pipe_run_ref.inputs = None - # Mark as already scheduled - pipe_run_ref.is_scheduled = True - return pipe_run_ref + pipe_run_ref.is_complete = True + return DALIOutputBatchRef(pipe_run_ref, output_idx) -# In-place modify `default_collate_fn_map` to handle DALIProcessedSampleRef -default_collate_fn_map.update({DALIProcessedSampleRef: _collate_dali_processed_sample_ref_fn}) +# In-place modify `default_collate_fn_map` to handle DALIOutputSampleRef +_default_collate_fn_map.update({DALIOutputSampleRef: _collate_dali_output_sample_ref_fn}) class _DALIProxy: - def __init__(self, dali_input_names, dali_input_q, deterministic): + def __init__(self, dali_input_names, dali_input_q, dali_num_outputs, deterministic): # External source instance names self._dali_input_names = dali_input_names # If True, the request is not sent to DALI upon creation, so that it can be scheduled @@ -189,43 +205,64 @@ def __init__(self, 
dali_input_names, dali_input_q, deterministic): self._deterministic = deterministic # Shared queue with the server self._dali_input_q = dali_input_q - # Torch worker id, to be filled on first call to worker_id() - self._worker_id = None - # Iteration index for the current worker - self.data_idx = 0 - - @property - def worker_id(self): + # Number of outputs in the pipeline + self._dali_num_outputs = dali_num_outputs + # Per worker + self._worker_data = None + + def _init_worker_data(self): + self._worker_data = { + "worker_id": self._get_worker_id(), + "data_idx": 0, + "pipe_run_ref": None, + "batch_sample_idx": 0, + } + + def _get_worker_data(self): + if self._worker_data is None: + self._init_worker_data() + return self._worker_data + + def _get_worker_id(self): """ Getter for 'worker_id'. In case of torch data worker it is the worker info, and in case of a thread the thread identifier """ - if self._worker_id is None: - worker_info = torchdata.get_worker_info() - self._worker_id = worker_info.id if worker_info else threading.get_ident() - return self._worker_id - - def _create_pipe_run_ref(self, inputs): - # Identifier of this request - batch_id = (self.worker_id, self.data_idx) - self.data_idx = self.data_idx + 1 - return DALIPipelineRunRef(batch_id, inputs=inputs) + worker_info = _torchdata.get_worker_info() + return worker_info.id if worker_info else threading.get_ident() + + def _add_sample(self, bound_args): + state = self._get_worker_data() + if state["pipe_run_ref"] is None or state["pipe_run_ref"].is_complete: + state["pipe_run_ref"] = DALIPipelineRunRef( + self, (state["worker_id"], state["data_idx"]) + ) + state["data_idx"] += 1 + state["batch_sample_idx"] = 0 + + for name, value in bound_args.arguments.items(): + if name != "self": + state["pipe_run_ref"].inputs[name].append(value) + + ret = tuple( + DALIOutputSampleRef( + self, + pipe_run_ref=state["pipe_run_ref"], + output_idx=i, + sample_idx=state["batch_sample_idx"], + ) + for i in range(self._dali_num_outputs) + ) + state["batch_sample_idx"] += 1 + return ret[0] if len(ret) == 1 else ret def _schedule_batch(self, pipe_run_ref): - torch.cuda.nvtx.range_push(f"dali_proxy.dali_input_q.put {pipe_run_ref.batch_id}") + if pipe_run_ref.inputs is None: + raise RuntimeError("No inputs for the pipeline to run (was it already scheduled?)") + _torch.cuda.nvtx.range_push(f"dali_proxy.dali_input_q.put {pipe_run_ref.batch_id}") self._dali_input_q.put((pipe_run_ref.batch_id, pipe_run_ref.inputs)) - torch.cuda.nvtx.range_pop() - - def __call__(self, *inputs): - """ - Returns a reference to the pipeline run - """ - if len(inputs) != len(self._dali_input_names): - raise RuntimeError( - f"Unexpected number of inputs. 
Expected: {self._dali_input_names}, got: {inputs}"
-            )
-
-        return DALIProcessedSampleRef(self, inputs)
+        pipe_run_ref.is_scheduled = True
+        _torch.cuda.nvtx.range_pop()


 class DALIServer:
@@ -309,24 +346,16 @@ def read_filepath(path):
             pass
         """
-        assert isinstance(pipeline, Pipeline), f"Expected an NVIDIA DALI pipeline, got: {pipeline}"
-        self._pipe = pipeline
-        self._pipe_input_names = _external_source_node_names(self._pipe)
-        if len(self._pipe_input_names) == 0:
-            raise RuntimeError("The provided pipeline doesn't have any inputs")
-        elif len(self._pipe_input_names) == 1:
-            assert input_names is None or input_names[0] == self._pipe_input_names[0]
-            self._dali_input_names = self._pipe_input_names
-        elif input_names is None or len(input_names) != len(self._pipe_input_names):
-            raise RuntimeError(
-                "The provided pipeline has more than one output. In such case, the argument "
-                "`input_names` should containi the same exact number of strings, one for "
-                "each pipeline input to be mapped by the proxy callable object"
-            )
-        self._num_inputs = len(self._dali_input_names)
-
+        if not isinstance(pipeline, _Pipeline):
+            raise RuntimeError(f"Expected an NVIDIA DALI pipeline, got: {pipeline}")
+        self._pipe = pipeline
+        # Get and validate the DALI pipeline input names
+        self._dali_input_names, self._allow_positional_args = self._check_dali_input_names(
+            input_names
+        )
         # Multi-process queue used to transfer data from the pytorch workers to the main process
-        self._dali_input_q = mp.Queue()
+        self._dali_input_q = _mp.Queue()
         # Multi-process queue used by the main process to consume outputs from the DALI pipeline
         self._dali_output_q = queue.Queue()
         # Thread
@@ -335,15 +364,68 @@ def read_filepath(path):
         self._cache_outputs = dict()
         # Whether we want the order of DALI execution to be reproducible
         self._deterministic = deterministic
+        # Proxy
+        self._proxy = None
+
+    def _check_dali_input_names(self, input_names):
+        pipe_input_names = _external_source_node_names(self._pipe)
+        if len(pipe_input_names) == 0:
+            raise RuntimeError("The provided pipeline doesn't have any inputs")
+        pipe_input_names_set = set(pipe_input_names)
+        input_names_set = set(input_names or [])
+        if len(input_names_set) != len(input_names or []):
+            raise RuntimeError("``input_names`` argument should not contain any duplicated values")
+
+        if len(input_names_set) == 0:
+            allow_positional_args = len(pipe_input_names) == 1
+            return pipe_input_names, allow_positional_args
+
+        if input_names_set != pipe_input_names_set:
+            raise RuntimeError(
+                "The set of DALI input names provided should match exactly the "
+                "ones provided by the pipeline. "
+                f"\nProvided input names are: {input_names}"
+                f"\nPipeline input names are: {pipe_input_names}"
+            )
+        return input_names, True

     @property
     def proxy(self):
-        return _DALIProxy(self._dali_input_names, self._dali_input_q, self._deterministic)
+        if self._proxy is None:
+            parameters = [inspect.Parameter("self", inspect.Parameter.POSITIONAL_OR_KEYWORD)]
+            for input_name in self._dali_input_names:
+                if self._allow_positional_args:
+                    parameters.append(
+                        inspect.Parameter(input_name, inspect.Parameter.POSITIONAL_OR_KEYWORD)
+                    )
+                else:
+                    parameters.append(inspect.Parameter(input_name, inspect.Parameter.KEYWORD_ONLY))
+
+            signature = inspect.Signature(parameters)
+
+            def call_impl(self, *args, **kwargs):
+                try:
+                    bound_args = signature.bind(self, *args, **kwargs)
+                except Exception as exc:
+                    raise RuntimeError(f"{exc}. 
Signature is {signature}") + return self._add_sample(bound_args) + + call_impl.__signature__ = inspect.Signature(parameters) + _DALIProxy.__call__ = call_impl + self._proxy = _DALIProxy( + self._dali_input_names, + self._dali_input_q, + self._pipe.num_outputs, + self._deterministic, + ) + + return self._proxy def _schedule_batch(self, pipe_run_ref): - torch.cuda.nvtx.range_push(f"dali_proxy.dali_input_q.put {pipe_run_ref.batch_id}") + _torch.cuda.nvtx.range_push(f"dali_proxy.dali_input_q.put {pipe_run_ref.batch_id}") self._dali_input_q.put((pipe_run_ref.batch_id, pipe_run_ref.inputs)) - torch.cuda.nvtx.range_pop() + pipe_run_ref.is_scheduled = True + _torch.cuda.nvtx.range_pop() def _get_outputs(self, pipe_run_ref: DALIPipelineRunRef): """ @@ -355,7 +437,6 @@ def _get_outputs(self, pipe_run_ref: DALIPipelineRunRef): # In case we haven't scheduled the iteration yet (i.e. deterministic config), do it now if not pipe_run_ref.is_scheduled: self._schedule_batch(pipe_run_ref) - pipe_run_ref.is_scheduled = True # Wait for the requested output to be ready req_outputs = None @@ -368,9 +449,9 @@ def _get_outputs(self, pipe_run_ref: DALIPipelineRunRef): curr_batch_id = None # If not the data we are looking for, store it and keep processing until we find it while req_batch_id != curr_batch_id: - torch.cuda.nvtx.range_push("dali_output_q.get") + _torch.cuda.nvtx.range_push("dali_output_q.get") curr_batch_id, curr_processed_outputs, err = self._dali_output_q.get() - torch.cuda.nvtx.range_pop() + _torch.cuda.nvtx.range_pop() if err is not None: raise err @@ -379,47 +460,78 @@ def _get_outputs(self, pipe_run_ref: DALIPipelineRunRef): req_outputs = curr_processed_outputs else: self._cache_outputs[curr_batch_id] = curr_processed_outputs - # Unpack single element tuples - if isinstance(req_outputs, tuple) and len(req_outputs) == 1: - req_outputs = req_outputs[0] return req_outputs - def produce_data(self, obj): + def _produce_data_impl(self, obj, cache): """ - A generic function to recursively visits all elements in a nested structure and replace - instances of DALIPipelineRunRef with the actual data provided by the DALI server - - Args: - obj: The object to map (can be an instance of any class). - - Returns: - A new object where any instance of DALIPipelineRunRef has been replaced with actual - data. 
+        Recursive implementation of produce_data
         """
-        # If it is an instance of DALIPipelineRunRef, replace it with data
-        if isinstance(obj, DALIPipelineRunRef):
-            return self._get_outputs(obj)
+        # If it is an instance of DALIOutputBatchRef, replace it with the actual data.
+        # The pipeline outputs are fetched once per batch and then served from the cache.
+        if isinstance(obj, DALIOutputBatchRef):
+            if obj.pipe_run_ref.batch_id not in cache:
+                cache[obj.pipe_run_ref.batch_id] = self._get_outputs(obj.pipe_run_ref)
+            return cache[obj.pipe_run_ref.batch_id][obj.output_idx]
+
         # If it is a custom class, recursively call produce data on its members
         elif hasattr(obj, "__dict__"):
             for attr_name, attr_value in obj.__dict__.items():
-                setattr(obj, attr_name, self.produce_data(attr_value))
+                setattr(obj, attr_name, self._produce_data_impl(attr_value, cache))
             return obj

         # If it's a list, recursively apply the function to each element
         elif isinstance(obj, list):
-            return [self.produce_data(item) for item in obj]
+            return [self._produce_data_impl(item, cache) for item in obj]

         # If it's a tuple, recursively apply the function to each element (and preserve tuple type)
         elif isinstance(obj, tuple):
-            return tuple(self.produce_data(item) for item in obj)
+            return tuple(self._produce_data_impl(item, cache) for item in obj)

-        # If it's a dictionary, apply the function to both keys and values
+        # If it's a dictionary, apply the function to the values
         elif isinstance(obj, dict):
-            return {key: self.produce_data(value) for key, value in obj.items()}
+            return {key: self._produce_data_impl(value, cache) for key, value in obj.items()}

         else:
             # return directly anything else
             return obj

+    def produce_data(self, obj):
+        """
+        A generic function that recursively visits all elements in a nested structure and
+        replaces instances of DALIOutputBatchRef with the actual data provided by the DALI
+        server.
+
+        Args:
+            obj: The object to map (can be an instance of any class).
+
+        Returns:
+            A new object where any instance of DALIOutputBatchRef has been replaced with actual
+            data.
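+
+        A minimal usage sketch (assuming ``loader`` yields structures containing
+        DALIOutputBatchRef instances, as the proxy DataLoader does)::
+
+            with dali_proxy.DALIServer(pipe) as dali_server:
+                for data, target in loader:
+                    data = dali_server.produce_data(data)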
+        """
+        cache = dict()
+        ret = self._produce_data_impl(obj, cache)
+        del cache
+        return ret
+
+    def _get_input_batches(self, max_num_batches, timeout=None):
+        _torch.cuda.nvtx.range_push("dali_input_q.get")
+        count = 0
+        batches = []
+        if timeout is not None:
+            try:
+                batches.append(self._dali_input_q.get(timeout=timeout))
+                count = count + 1
+            except (Empty, _mp.TimeoutError):
+                # Make sure the NVTX range is closed also on the early-exit path
+                _torch.cuda.nvtx.range_pop()
+                return None
+
+        while count < max_num_batches:
+            try:
+                batches.append(self._dali_input_q.get_nowait())
+                count = count + 1
+            except Empty:
+                break
+        _torch.cuda.nvtx.range_pop()
+        return batches
+
     def _thread_fn(self):
         """
         Asynchronous DALI thread that gets iteration data from the queue and schedules it
@@ -427,23 +539,33 @@ def _thread_fn(self):
         """
         self._pipe.build()  # just in case
+        fed_batches = queue.Queue()
         while not self._thread_stop_event.is_set():
-            try:
-                torch.cuda.nvtx.range_push("dali_input_q.get")
-                batch_id, inputs = self._dali_input_q.get(timeout=5)
-                torch.cuda.nvtx.range_pop()
-            except mp.TimeoutError:
-                continue
-            except Empty:
+            _torch.cuda.nvtx.range_push("get_input_batches")
+            timeout = 5 if fed_batches.empty() else None
+            # Try to feed as many batches as the prefetch queue allows (if available)
+            batches = self._get_input_batches(
+                self._pipe.prefetch_queue_depth - fed_batches.qsize(), timeout=timeout
+            )
+            _torch.cuda.nvtx.range_pop()
+            if batches is not None and len(batches) > 0:
+                _torch.cuda.nvtx.range_push("feed_pipeline")
+                for batch_id, inputs in batches:
+                    for input_name, input_data in inputs.items():
+                        self._pipe.feed_input(input_name, input_data)
+                    self._pipe._run_once()
+                    fed_batches.put(batch_id)
+                _torch.cuda.nvtx.range_pop()
+
+            # If there are no fed batches to consume, continue
+            if fed_batches.qsize() == 0:
                 continue
+            _torch.cuda.nvtx.range_push("outputs")
+            batch_id = fed_batches.get_nowait()  # we are sure there's at least one
             err = None
             torch_outputs = None
-            torch.cuda.nvtx.range_push(f"schedule iteration {batch_id}")
             try:
-                for idx, input_name in enumerate(self._dali_input_names):
-                    self._pipe.feed_input(input_name, inputs[idx])
-                self._pipe._run_once()
                 pipe_outputs = self._pipe.outputs()
                 torch_outputs = tuple(
                     [
@@ -453,8 +575,9 @@ def _thread_fn(self):
                 )
             except Exception as exception:
                 err = exception
+
             self._dali_output_q.put((batch_id, torch_outputs, err))
-            torch.cuda.nvtx.range_pop()
+            _torch.cuda.nvtx.range_pop()

     def start_thread(self):
         """
@@ -488,13 +611,13 @@ def __exit__(self, exc_type, exc_value, tb):
         return False  # Return False to propagate exceptions


-class DataLoader(torchdata.dataloader.DataLoader):
+class DataLoader(_torchdata.dataloader.DataLoader):
     """
     DALI data loader to be used in the main loop, which runs the DALI pipeline doing the
     processing asynchronously with regard to the training.
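
    Example (a sketch following this PR's tests; the pipeline and dataset are assumed
    to exist)::

        with dali_proxy.DALIServer(pipe) as dali_server:
            dataset = datasets.ImageFolder(data_dir, transform=dali_server.proxy)
            loader = dali_proxy.DataLoader(
                dali_server, dataset, batch_size=batch_size, num_workers=nworkers
            )
            for data, target in loader:
                ...  # data has already been processed by the DALI pipeline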
""" - class _Iter(torchdata.dataloader._MultiProcessingDataLoaderIter): + class _Iter(_torchdata.dataloader._MultiProcessingDataLoaderIter): """ Data loader iterator used by the DALI proxy data loader """ @@ -502,13 +625,12 @@ class _Iter(torchdata.dataloader._MultiProcessingDataLoaderIter): def __init__(self, loader): super().__init__(loader) self.loader = loader + if self.loader.dali_server._thread is None: + raise RuntimeError("DALI server is not running") def _next_data(self): data = super()._next_data() - if self.loader.dali_server._thread is None: - raise RuntimeError("DALI server is not running") - data = self.loader.dali_server.produce_data(data) - return data + return self.loader.dali_server.produce_data(data) def __init__(self, dali_server, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/dali/test/python/test_dali_proxy.py b/dali/test/python/test_dali_proxy.py index ef6ed96a35..ee4e5264a6 100644 --- a/dali/test/python/test_dali_proxy.py +++ b/dali/test/python/test_dali_proxy.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from nvidia.dali import pipeline_def, fn, types import numpy as np import os @@ -5,6 +19,7 @@ from nose_utils import attr, assert_raises import PIL.Image + def read_file(path): return np.fromfile(path, dtype=np.uint8) @@ -49,9 +64,7 @@ def image_pipe(dali_device="gpu", include_decoder=True, random_pipe=True): if random_pipe: shapes = images.shape() crop_anchor, crop_shape = fn.random_crop_generator( - shapes, - random_aspect_ratio=[0.75, 4.0 / 3.0], - random_area=[0.08, 1.0] + shapes, random_aspect_ratio=[0.75, 4.0 / 3.0], random_area=[0.08, 1.0] ) images = fn.slice(images, start=crop_anchor, shape=crop_shape, axes=[0, 1]) @@ -75,7 +88,7 @@ def image_pipe(dali_device="gpu", include_decoder=True, random_pipe=True): @attr("pytorch") -@params(("cpu", False), ("cpu", True), ("cpu", False), ("gpu", True)) +@params(("cpu", False), ("cpu", True), ("gpu", False), ("gpu", True)) def test_dali_proxy_torch_data_loader(device, include_decoder, debug=False): # Shows how DALI proxy is used in practice with a PyTorch data loader @@ -114,7 +127,9 @@ def test_dali_proxy_torch_data_loader(device, include_decoder, debug=False): if include_decoder: dataset = datasets.ImageFolder(jpeg, transform=dali_server.proxy, loader=read_filepath) - dataset_ref = datasets.ImageFolder(jpeg, transform=lambda x: x.copy(), loader=read_filepath) + dataset_ref = datasets.ImageFolder( + jpeg, transform=lambda x: x.copy(), loader=read_filepath + ) else: dataset = datasets.ImageFolder(jpeg, transform=dali_server.proxy) dataset_ref = datasets.ImageFolder(jpeg, transform=lambda x: x.copy()) @@ -149,7 +164,9 @@ def ref_collate_fn(batch): target.shape, ) np.testing.assert_array_equal(target, ref_target) - ref_data_nparrays = [np.array(obj) if isinstance(obj, PIL.Image.Image) else obj for obj in ref_data] + ref_data_nparrays = [ + np.array(obj) if isinstance(obj, PIL.Image.Image) else obj for obj 
in ref_data
+            ]
             ref_data_tensors = [TensorCPU(arr) for arr in ref_data_nparrays]
             pipe_ref.feed_input("images", ref_data_tensors)
             (ref_data,) = pipe_ref.run()
@@ -161,7 +178,8 @@ def ref_collate_fn(batch):


 @attr("pytorch")
-def test_dali_proxy_torch_data_loader_manual_integration(device="gpu", debug=False):
+@params(("gpu",))
+def test_dali_proxy_manual_integration(device, debug=False):
     # Shows how to integrate with DALI proxy manually with an existing data loader

     from nvidia.dali.plugin.pytorch.experimental import proxy as dali_proxy
@@ -251,18 +269,16 @@ def __getitem__(self, idx):
         return img2, other

     # This is just for educational purposes. It is recommended to rely on
-    # default_collate_fn_map, which is updated to handle DALIProcessedSampleRef
+    # default_collate_fn_map, which is updated to handle DALIOutputSampleRef
     def custom_collate_fn(batch):
         images, labels = zip(*batch)
-        return dali_proxy._collate_dali_processed_sample_ref_fn(images), torch.tensor(
+        return dali_proxy._collate_dali_output_sample_ref_fn(images), torch.tensor(
             labels, dtype=torch.long
         )

     # Run the server (it also cleans up on scope exit)
     with dali_proxy.DALIServer(pipe) as dali_server:
-        dataset = CustomDatasetDALI(plain_dataset, dali_server.proxy)
-
         loader = torchdata.dataloader.DataLoader(
             dataset,
             batch_size=batch_size,
@@ -272,9 +288,8 @@ def custom_collate_fn(batch):
         )
         assert len(loader) > 0
-
         for next_input, next_target in loader:
-            assert isinstance(next_input, dali_proxy.DALIPipelineRunRef)
+            assert isinstance(next_input, dali_proxy.DALIOutputBatchRef)
             next_input = dali_server.produce_data(next_input)
             assert isinstance(next_input, torch.Tensor)
             np.testing.assert_equal([batch_size, 3, 224, 224], next_input.shape)
@@ -392,3 +407,142 @@ def pipe_with_error():
     # messages in the next test
     pipe._shutdown()
     del pipe
+
+
+@attr("pytorch")
+@params(("cpu",), ("gpu",))
+def test_dali_proxy_duplicated_outputs(device, debug=False):
+    from nvidia.dali.plugin.pytorch.experimental import proxy as dali_proxy
+    from torch.utils import data as torchdata
+    from PIL import Image
+
+    batch_size = 4
+    num_threads = 3
+    device_id = 0
+    nworkers = 4
+    pipe = image_pipe(
+        dali_device=device,
+        include_decoder=False,
+        random_pipe=False,
+        batch_size=batch_size,
+        num_threads=num_threads,
+        device_id=device_id,
+        prefetch_queue_depth=2 + nworkers,
+    )
+
+    class MyDataset(torchdata.Dataset):
+        def __init__(self, folder_path, transform):
+            self.folder_path = folder_path
+            self.image_files = self._find_images_in_folder(folder_path)
+            self.transform = transform
+
+        def _find_images_in_folder(self, folder_path):
+            """
+            Recursively find all image files in the folder and its subdirectories.
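+            Returns the list of paths to the image files found under ``folder_path``.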
+ """ + image_files = [] + + # Walk through all directories and subdirectories + for root, _, files in os.walk(folder_path): + for file in files: + if file.lower().endswith((".png", ".jpg", ".jpeg", ".bmp", ".gif")): + image_files.append(os.path.join(root, file)) + + return image_files + + def __len__(self): + """Returns the number of images in the folder.""" + return len(self.image_files) + + def __getitem__(self, idx): + img_name = self.image_files[idx] + img_path = os.path.join(self.folder_path, img_name) + img = Image.open(img_path).convert("RGB") # Convert image to RGB (3 channels) + img = self.transform(img) + return img, 1, img + + with dali_proxy.DALIServer(pipe) as dali_server: + dataset = MyDataset(jpeg, transform=dali_server.proxy) + loader = dali_proxy.DataLoader( + dali_server, + dataset, + batch_size=batch_size, + num_workers=nworkers, + drop_last=True, + ) + + for data1, _, data2 in loader: + np.testing.assert_array_equal(data1, data2) + + +@attr("pytorch") +@params(("cpu",), ("gpu",)) +def test_dali_proxy_rearrange_output_order_and_positional_args(device, debug=False): + from nvidia.dali.plugin.pytorch.experimental import proxy as dali_proxy + from torch.utils import data as torchdata + + batch_size = 4 + num_threads = 3 + device_id = 0 + nworkers = 4 + arrs = np.random.rand(20, 3) + + @pipeline_def + def pipe_2_outputs(): + a = fn.external_source(name="a", no_copy=True) + b = fn.external_source(name="b", no_copy=True) + if device == "gpu": + a = a.gpu() + b = b.gpu() + return a + b, b - a + + pipe1 = pipe_2_outputs( + batch_size=batch_size, + num_threads=num_threads, + device_id=device_id, + prefetch_queue_depth=2 + nworkers, + ) + pipe2 = pipe_2_outputs( + batch_size=batch_size, + num_threads=num_threads, + device_id=device_id, + prefetch_queue_depth=2 + nworkers, + ) + + class MyDataset(torchdata.Dataset): + def __init__(self, arrs, transform, reverse_order): + self.arrs = arrs + self.n = len(arrs) + self.transform = transform + self.reverse_order = reverse_order + + def __len__(self): + """Returns the number of images in the folder.""" + return self.n + + def __getitem__(self, idx): + a = self.arrs[idx] + b = self.arrs[idx + 1 if idx < self.n - 1 else 0] + a_plus_b, b_minus_a = self.transform(b=b, a=a) # reverse order in purpose + return (b_minus_a, 1, a_plus_b) if self.reverse_order else (a_plus_b, 1, b_minus_a) + + with dali_proxy.DALIServer(pipe1) as dali_server1, dali_proxy.DALIServer(pipe2) as dali_server2: + loader1 = dali_proxy.DataLoader( + dali_server1, + MyDataset(arrs, dali_server1.proxy, reverse_order=False), + batch_size=batch_size, + num_workers=nworkers, + drop_last=True, + ) + loader2 = dali_proxy.DataLoader( + dali_server2, + MyDataset(arrs, dali_server2.proxy, reverse_order=True), + batch_size=batch_size, + num_workers=nworkers, + drop_last=True, + ) + + for data1, data2 in zip(loader1, loader2): + np.testing.assert_array_equal(data1[0].cpu(), data2[2].cpu()) + np.testing.assert_array_equal(data1[1].cpu(), data2[1].cpu()) + np.testing.assert_array_equal(data1[2].cpu(), data2[0].cpu()) diff --git a/docs/examples/use_cases/pytorch/efficientnet/image_classification/dali.py b/docs/examples/use_cases/pytorch/efficientnet/image_classification/dali.py index c43c5f689e..88ff08c44a 100644 --- a/docs/examples/use_cases/pytorch/efficientnet/image_classification/dali.py +++ b/docs/examples/use_cases/pytorch/efficientnet/image_classification/dali.py @@ -21,31 +21,82 @@ @pipeline_def(enable_conditionals=True, exec_dynamic=True) def training_pipe(data_dir, 
interpolation, image_size, output_layout, automatic_augmentation, - dali_device="gpu", rank=0, world_size=1, send_filepaths=False): + dali_device="gpu", rank=0, world_size=1): rng = fn.random.coin_flip(probability=0.5) - if data_dir is None: - if send_filepaths: - filepaths = fn.external_source(name="images", no_copy=True) - jpegs = fn.io.file.read(filepaths) - else: - jpegs = fn.external_source(name="images", no_copy=True) + jpegs, labels = fn.readers.file(name="Reader", file_root=data_dir, shard_id=rank, + num_shards=world_size, random_shuffle=True, pad_last_batch=True) + + decoder_device = "mixed" if dali_device == "gpu" else "cpu" + images = fn.decoders.image_random_crop(jpegs, device=decoder_device, output_type=types.RGB, + random_aspect_ratio=[0.75, 4.0 / 3.0], + random_area=[0.08, 1.0]) + + images = fn.resize(images, size=[image_size, image_size], + interp_type=interpolation, antialias=False) + + # Make sure that from this point we are processing on GPU regardless of dali_device parameter + images = images.gpu() + + images = fn.flip(images, horizontal=rng) + + # Based on the specification, apply the automatic augmentation policy. Note, that from the point + # of Pipeline definition, this `if` statement relies on static scalar parameter, so it is + # evaluated exactly once during build - we either include automatic augmentations or not. + # We pass the shape of the image after the resize so the translate operations are done + # relative to the image size. + if automatic_augmentation == "autoaugment": + output = auto_augment.auto_augment_image_net(images, shape=[image_size, image_size]) + elif automatic_augmentation == "trivialaugment": + output = trivial_augment.trivial_augment_wide(images, shape=[image_size, image_size]) else: - jpegs, labels = fn.readers.file(name="Reader", file_root=data_dir, shard_id=rank, - num_shards=world_size, random_shuffle=True, pad_last_batch=True) + output = images - if dali_device == "gpu": - decoder_device = "mixed" - resize_device = "gpu" + output = fn.crop_mirror_normalize(output, dtype=types.FLOAT, output_layout=output_layout, + crop=(image_size, image_size), + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + std=[0.229 * 255, 0.224 * 255, 0.225 * 255]) + return output, labels + + +@pipeline_def(exec_dynamic=True) +def validation_pipe(data_dir, interpolation, image_size, image_crop, output_layout, + dali_device="gpu", rank=0, world_size=1): + jpegs, label = fn.readers.file(name="Reader", file_root=data_dir, shard_id=rank, + num_shards=world_size, random_shuffle=False, pad_last_batch=True) + + decoder_device = "mixed" if dali_device == "gpu" else "cpu" + images = fn.decoders.image(jpegs, device=decoder_device, output_type=types.RGB) + + images = fn.resize(images, resize_shorter=image_size, + interp_type=interpolation, antialias=False) + + images = images.gpu() + + output = fn.crop_mirror_normalize(images, dtype=types.FLOAT, output_layout=output_layout, + crop=(image_crop, image_crop), + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + std=[0.229 * 255, 0.224 * 255, 0.225 * 255]) + return output, label + + + +@pipeline_def(enable_conditionals=True, exec_dynamic=True) +def training_pipe_external_source(interpolation, image_size, output_layout, automatic_augmentation, + dali_device="gpu", send_filepaths=False): + rng = fn.random.coin_flip(probability=0.5) + if send_filepaths: + filepaths = fn.external_source(name="images", no_copy=True) + jpegs = fn.io.file.read(filepaths) else: - decoder_device = "cpu" - resize_device = "cpu" + jpegs = 
fn.external_source(name="images", no_copy=True) + decoder_device = "mixed" if dali_device == "gpu" else "cpu" images = fn.decoders.image_random_crop(jpegs, device=decoder_device, output_type=types.RGB, random_aspect_ratio=[0.75, 4.0 / 3.0], random_area=[0.08, 1.0]) - images = fn.resize(images, device=resize_device, size=[image_size, image_size], + images = fn.resize(images, size=[image_size, image_size], interp_type=interpolation, antialias=False) # Make sure that from this point we are processing on GPU regardless of dali_device parameter @@ -70,35 +121,22 @@ def training_pipe(data_dir, interpolation, image_size, output_layout, automatic_ mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], std=[0.229 * 255, 0.224 * 255, 0.225 * 255]) - if data_dir is None: - return output - else: - return output, labels + return output @pipeline_def(exec_dynamic=True) -def validation_pipe(data_dir, interpolation, image_size, image_crop, output_layout, - dali_device="gpu", rank=0, world_size=1, send_filepaths=False): - if data_dir is None: - if send_filepaths: - filepaths = fn.external_source(name="images", no_copy=True) - jpegs = fn.io.file.read(filepaths) - else: - jpegs = fn.external_source(name="images", no_copy=True) +def validation_pipe_external_source(interpolation, image_size, image_crop, output_layout, + dali_device="gpu", send_filepaths=False): + if send_filepaths: + filepaths = fn.external_source(name="images", no_copy=True) + jpegs = fn.io.file.read(filepaths) else: - jpegs, label = fn.readers.file(name="Reader", file_root=data_dir, shard_id=rank, - num_shards=world_size, random_shuffle=False, pad_last_batch=True) - - if dali_device == "gpu": - decoder_device = "mixed" - resize_device = "gpu" - else: - decoder_device = "cpu" - resize_device = "cpu" + jpegs = fn.external_source(name="images", no_copy=True) + decoder_device = "mixed" if dali_device == "gpu" else "cpu" images = fn.decoders.image(jpegs, device=decoder_device, output_type=types.RGB) - images = fn.resize(images, device=resize_device, resize_shorter=image_size, + images = fn.resize(images, resize_shorter=image_size, interp_type=interpolation, antialias=False) images = images.gpu() @@ -108,7 +146,4 @@ def validation_pipe(data_dir, interpolation, image_size, image_crop, output_layo mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], std=[0.229 * 255, 0.224 * 255, 0.225 * 255]) - if data_dir is None: - return output - else: - return output, label + return output diff --git a/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py b/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py index 7e9ab3d73e..66b060004d 100644 --- a/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py +++ b/docs/examples/use_cases/pytorch/efficientnet/image_classification/dataloaders.py @@ -41,6 +41,7 @@ import nvidia.dali.types as types from image_classification.dali import training_pipe, validation_pipe + from image_classification.dali import training_pipe_external_source, validation_pipe_external_source DATA_BACKEND_CHOICES.append("dali") DATA_BACKEND_CHOICES.append("dali_proxy") @@ -157,7 +158,7 @@ def gdtl( pipe = training_pipe(data_dir=traindir, interpolation=interpolation, image_size=image_size, output_layout=output_layout, automatic_augmentation=augmentation, dali_device=dali_device, rank=rank, world_size=world_size, - prefetch_queue_depth=8, + prefetch_queue_depth=workers*2, **pipeline_kwargs) pipe.build() @@ -261,6 +262,7 @@ def expand(num_classes, dtype, tensor): class 
PrefetchedWrapper(object): + @staticmethod def prefetched_loader(loader, num_classes, one_hot, normalize, memory_format, data_layout): if normalize: mean = ( @@ -480,10 +482,11 @@ def get_impl(data_path, "seed": 12 + rank % torch.cuda.device_count(), } - pipe = training_pipe(data_dir=None, interpolation=interpolation, image_size=image_size, - output_layout=output_layout, automatic_augmentation=augmentation, - dali_device=dali_device, prefetch_queue_depth=8, send_filepaths=send_filepaths, - **pipeline_kwargs) + pipe = training_pipe_external_source( + interpolation=interpolation, image_size=image_size, + output_layout=output_layout, automatic_augmentation=augmentation, + dali_device=dali_device, prefetch_queue_depth=workers*2, send_filepaths=send_filepaths, + **pipeline_kwargs) pipe.build() dali_server = dali_proxy.DALIServer(pipe) @@ -552,12 +555,12 @@ def get_impl(data_path, "seed": 12 + rank % torch.cuda.device_count(), } - pipe = validation_pipe(data_dir=None, interpolation=interpolation, - image_size=image_size + crop_padding, image_crop=image_size, - output_layout=output_layout, - send_filepaths=send_filepaths, - **pipeline_kwargs) - + pipe = validation_pipe_external_source( + interpolation=interpolation, + image_size=image_size + crop_padding, image_crop=image_size, + output_layout=output_layout, + send_filepaths=send_filepaths, + **pipeline_kwargs) pipe.build() dali_server = dali_proxy.DALIServer(pipe) diff --git a/docs/examples/use_cases/pytorch/resnet50/main.py b/docs/examples/use_cases/pytorch/resnet50/main.py index 79b52dd43e..7db1ef1a24 100644 --- a/docs/examples/use_cases/pytorch/resnet50/main.py +++ b/docs/examples/use_cases/pytorch/resnet50/main.py @@ -123,19 +123,12 @@ def to_python_float(t): @pipeline_def(exec_dynamic=True) def create_dali_pipeline(data_dir, crop, size, shard_id, num_shards, dali_cpu=False, is_training=True): - if data_dir is None: - if args.send_filepaths: - filepaths = fn.external_source(name="images", no_copy=True) - images = fn.io.file.read(filepaths) - else: - images = fn.external_source(name="images", no_copy=True) - else: - images, labels = fn.readers.file(file_root=data_dir, - shard_id=shard_id, - num_shards=num_shards, - random_shuffle=is_training, - pad_last_batch=True, - name="Reader") + images, labels = fn.readers.file(file_root=data_dir, + shard_id=shard_id, + num_shards=num_shards, + random_shuffle=is_training, + pad_last_batch=True, + name="Reader") dali_device = 'cpu' if dali_cpu else 'gpu' decoder_device = 'cpu' if dali_cpu else 'mixed' @@ -174,12 +167,58 @@ def create_dali_pipeline(data_dir, crop, size, shard_id, num_shards, dali_cpu=Fa mean=[0.485 * 255,0.456 * 255,0.406 * 255], std=[0.229 * 255,0.224 * 255,0.225 * 255], mirror=mirror) + labels = labels.gpu() + return images, labels + + + +@pipeline_def(exec_dynamic=True) +def create_dali_pipeline_external_source(crop, size, dali_cpu=False, is_training=True): + if args.send_filepaths: + filepaths = fn.external_source(name="images", no_copy=True) + images = fn.io.file.read(filepaths) + else: + images = fn.external_source(name="images", no_copy=True) - if data_dir is None: - return images + dali_device = 'cpu' if dali_cpu else 'gpu' + decoder_device = 'cpu' if dali_cpu else 'mixed' + # ask HW NVJPEG to allocate memory ahead for the biggest image in the data set to avoid reallocations in runtime + preallocate_width_hint = 5980 if decoder_device == 'mixed' else 0 + preallocate_height_hint = 6430 if decoder_device == 'mixed' else 0 + if is_training: + images = 
fn.decoders.image_random_crop(images, + device=decoder_device, output_type=types.RGB, + preallocate_width_hint=preallocate_width_hint, + preallocate_height_hint=preallocate_height_hint, + random_aspect_ratio=[0.8, 1.25], + random_area=[0.1, 1.0], + num_attempts=100) + images = fn.resize(images, + device=dali_device, + resize_x=crop, + resize_y=crop, + interp_type=types.INTERP_TRIANGULAR) + mirror = fn.random.coin_flip(probability=0.5) else: - labels = labels.gpu() - return images, labels + images = fn.decoders.image(images, + device=decoder_device, + output_type=types.RGB) + images = fn.resize(images, + device=dali_device, + size=size, + mode="not_smaller", + interp_type=types.INTERP_TRIANGULAR) + mirror = False + + images = fn.crop_mirror_normalize(images.gpu(), + dtype=types.FLOAT, + output_layout="CHW", + crop=(crop, crop), + mean=[0.485 * 255,0.456 * 255,0.406 * 255], + std=[0.229 * 255,0.224 * 255,0.225 * 255], + mirror=mirror) + + return images def main(): @@ -300,12 +339,12 @@ def resume(): val_pipe = None dali_server_train = None dali_server_val = None - if not args.disable_dali: + if not args.disable_dali and not args.dali_proxy: train_pipe = create_dali_pipeline(batch_size=args.batch_size, num_threads=args.workers, device_id=args.local_rank, seed=12 + args.local_rank, - data_dir=traindir if not args.dali_proxy else None, + data_dir=traindir, crop=crop_size, size=val_size, dali_cpu=args.dali_cpu, @@ -318,7 +357,7 @@ def resume(): num_threads=args.workers, device_id=args.local_rank, seed=12 + args.local_rank, - data_dir=valdir if not args.dali_proxy else None, + data_dir=valdir, crop=crop_size, size=val_size, dali_cpu=args.dali_cpu, @@ -327,14 +366,35 @@ def resume(): is_training=False) val_pipe.build() - if not args.disable_dali and not args.dali_proxy: train_loader = DALIClassificationIterator(train_pipe, reader_name="Reader", last_batch_policy=LastBatchPolicy.PARTIAL, auto_reset=True) val_loader = DALIClassificationIterator(val_pipe, reader_name="Reader", last_batch_policy=LastBatchPolicy.PARTIAL, auto_reset=True) - elif args.dali_proxy: + elif not args.disable_dali and args.dali_proxy: + train_pipe = create_dali_pipeline_external_source( + batch_size=args.batch_size, + num_threads=args.workers, + device_id=args.local_rank, + seed=12 + args.local_rank, + crop=crop_size, + size=val_size, + dali_cpu=args.dali_cpu, + is_training=True) + train_pipe.build() + + val_pipe = create_dali_pipeline_external_source( + batch_size=args.batch_size, + num_threads=args.workers, + device_id=args.local_rank, + seed=12 + args.local_rank, + crop=crop_size, + size=val_size, + dali_cpu=args.dali_cpu, + is_training=False) + val_pipe.build() + assert train_pipe is not None assert val_pipe is not None dali_server_train = dali_proxy.DALIServer(train_pipe)
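
For reference, a minimal sketch of how the pieces above fit together end to end. It follows the pattern of `dataloaders.py` and the tests in this diff; `data_dir` and the hyper-parameters are illustrative, not part of the change:

```python
# Illustrative sketch (not part of the diff) of the DALI proxy flow added above.
import numpy as np
import nvidia.dali.types as types
import torchvision.datasets as datasets
from nvidia.dali.plugin.pytorch.experimental import proxy as dali_proxy
from image_classification.dali import training_pipe_external_source

nworkers = 8
pipe = training_pipe_external_source(
    interpolation=types.INTERP_LINEAR, image_size=224, output_layout="CHW",
    automatic_augmentation="autoaugment", batch_size=256, num_threads=4,
    device_id=0, prefetch_queue_depth=2 * nworkers,
)
pipe.build()

with dali_proxy.DALIServer(pipe) as dali_server:
    # The proxy stands in for the usual torchvision transform: PyTorch workers hand
    # raw encoded images over to the single DALI pipeline instead of decoding locally.
    dataset = datasets.ImageFolder(
        "data_dir",  # illustrative path
        transform=dali_server.proxy,
        loader=lambda path: np.fromfile(path, dtype=np.uint8),
    )
    loader = dali_proxy.DataLoader(
        dali_server, dataset, batch_size=256, num_workers=nworkers, drop_last=True
    )
    for images, labels in loader:
        ...  # images arrive already decoded and augmented by the DALI pipeline
```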