diff --git a/torchbenchmark/models/moco/__init__.py b/torchbenchmark/models/moco/__init__.py
index d6290a895e..0acad7dc17 100644
--- a/torchbenchmark/models/moco/__init__.py
+++ b/torchbenchmark/models/moco/__init__.py
@@ -56,18 +56,29 @@ def __init__(self, test, device, batch_size=None, extra_args=[]):
                 "distributed": True,
             }
         )
-        try:
-            dist.init_process_group(
-                backend="nccl",
-                init_method="tcp://localhost:10001",
-                world_size=1,
-                rank=0,
-            )
-        except RuntimeError:
-            pass  # already initialized?
 
         if device == "cpu":
             raise NotImplementedError("DistributedDataParallel/allgather requires cuda")
+        elif device == "cuda":
+            try:
+                dist.init_process_group(
+                    backend="nccl",
+                    init_method="tcp://localhost:10001",
+                    world_size=1,
+                    rank=0,
+                )
+            except RuntimeError:
+                pass  # already initialized?
+        elif device.startswith("xla"):
+            import torch_xla.distributed.xla_backend
+
+            try:
+                dist.init_process_group(backend="xla", init_method="xla://")
+            except RuntimeError:
+                pass  # already initialized?
+        else:
+            raise NotImplementedError(f"{device} not supported")
+
 
         self.model = MoCo(
             models.__dict__[self.opt.arch],
@@ -102,8 +113,8 @@ def collate_train_fn(data):
             range(2), collate_fn=collate_train_fn
         )
         for i, (images, _) in enumerate(self.example_inputs):
-            images[0] = images[0].cuda(device=0, non_blocking=True)
-            images[1] = images[1].cuda(device=0, non_blocking=True)
+            images[0] = images[0].to(device, non_blocking=True)
+            images[1] = images[1].to(device, non_blocking=True)
 
     def get_module(self):
         """Recommended
diff --git a/torchbenchmark/models/moco/moco/builder.py b/torchbenchmark/models/moco/moco/builder.py
index 295e22a7e7..a0a0329134 100644
--- a/torchbenchmark/models/moco/moco/builder.py
+++ b/torchbenchmark/models/moco/moco/builder.py
@@ -79,7 +79,7 @@ def _batch_shuffle_ddp(self, x):
         num_gpus = batch_size_all // batch_size_this
 
         # random shuffle index
-        idx_shuffle = torch.randperm(batch_size_all).cuda()
+        idx_shuffle = torch.randperm(batch_size_all, device=x_gather.device)
 
         # broadcast to all gpus
         torch.distributed.broadcast(idx_shuffle, src=0)
@@ -152,7 +152,7 @@ def forward(self, im_q, im_k):
         logits /= self.T
 
         # labels: positive key indicators
-        labels = torch.zeros(logits.shape[0], dtype=torch.long).cuda()
+        labels = torch.zeros(logits.shape[0], dtype=torch.long, device=logits.device)
 
         # dequeue and enqueue
         self._dequeue_and_enqueue(k)