From d33a21383c1ff2f20d3ea2190e398b080cd22a76 Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Fri, 20 Sep 2024 16:32:44 +0300
Subject: [PATCH 1/2] Add LoRA extensions

---
 vllm_hpu_extension/punica_hpu.py | 81 ++++++++++++++++++++++++++++++++
 1 file changed, 81 insertions(+)
 create mode 100644 vllm_hpu_extension/punica_hpu.py

diff --git a/vllm_hpu_extension/punica_hpu.py b/vllm_hpu_extension/punica_hpu.py
new file mode 100644
index 00000000..9b726156
--- /dev/null
+++ b/vllm_hpu_extension/punica_hpu.py
@@ -0,0 +1,81 @@
+###############################################################################
+# Copyright (C) 2024 Habana Labs, Ltd. an Intel Company
+#
+# This source code is licensed under the BSD license found in the
+# LICENSE file in the root directory of this source tree.
+###############################################################################
+
+from typing import Optional, Tuple
+
+import torch
+from vllm_hpu_extension.ops import (dispatch_bgmv_embedding,
+                                    dispatch_bgmv_linear)
+
+from vllm.lora.punica import PunicaWrapper
+
+
+class GaudiPunicaWrapper(PunicaWrapper):
+
+    def __init__(self, max_num_batched_tokens: int, max_batches: int,
+                 device: str):
+        super().__init__(max_num_batched_tokens, max_batches, device)
+
+    def add_lora(self,
+                 y: torch.Tensor,
+                 x: torch.Tensor,
+                 wa_t_all: torch.Tensor,
+                 wb_t_all: torch.Tensor,
+                 scale: float,
+                 y_offset: Optional[int] = None,
+                 y_slice_size: Optional[int] = None,
+                 *,
+                 buffer: Optional[torch.Tensor] = None) -> None:
+        y_org = y
+        x = x.view(-1, x.shape[-1])
+        y = y.view(-1, y.shape[-1])
+        dispatch_bgmv_linear(y, x, wa_t_all, wb_t_all, 0, 1.0)
+        y = y.view_as(y_org)
+
+    def add_lora_packed_nslice(self, y: torch.Tensor, x: torch.Tensor,
+                               lora_a_stacked: Tuple[torch.Tensor,
+                                                     torch.Tensor,
+                                                     torch.Tensor],
+                               lora_b_stacked: Tuple[torch.Tensor,
+                                                     torch.Tensor,
+                                                     torch.Tensor],
+                               scale: float,
+                               output_slices: Tuple[int, ...]) -> None:
+        y_org = y
+        x = x.view(-1, x.shape[-1])
+        y = y.view(-1, y.shape[-1])
+        offset_left = 0
+
+        for slice_idx in range(len(output_slices)):
+            dispatch_bgmv_linear(
+                y[:, offset_left:offset_left + output_slices[slice_idx]], x,
+                lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, 1.0)
+            offset_left += output_slices[slice_idx]
+        y = y.view_as(y_org)
+
+    def add_lora_logits(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        wa_t_all: torch.Tensor,
+                        wb_t_all: torch.Tensor,
+                        scale,
+                        *,
+                        buffer: Optional[torch.Tensor] = None) -> None:
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        x = x.view(-1, x.shape[-1])
+        dispatch_bgmv_linear(y, x, wa_t_all, wb_t_all, 0, 1.0)
+        y = y.view_as(y_org)
+
+    def add_lora_embedding(
+        self,
+        y: torch.Tensor,
+        x: torch.Tensor,
+        w_t_all: torch.Tensor,
+        add_input: bool = True,
+    ):
+        dispatch_bgmv_embedding(y, x, w_t_all, 0, 1.0)

From 243dccc51d98197dfefbce51e5a21888168060dd Mon Sep 17 00:00:00 2001
From: Konrad Zawora
Date: Fri, 20 Sep 2024 16:35:14 +0300
Subject: [PATCH 2/2] add missing change

---
 vllm_hpu_extension/ops.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm_hpu_extension/ops.py b/vllm_hpu_extension/ops.py
index 97add7b9..44c80adf 100644
--- a/vllm_hpu_extension/ops.py
+++ b/vllm_hpu_extension/ops.py
@@ -174,7 +174,6 @@ def dispatch_bgmv_linear(
     x: torch.Tensor,
     wa_t_all: torch.Tensor,
     wb_t_all: torch.Tensor,
-    indices: torch.LongTensor,
     layer_idx: int,
     scale: float,
 ):
@@ -209,7 +208,6 @@ def dispatch_bgmv_embedding(
     y: torch.Tensor,
     x: torch.Tensor,
     wb_t_all: torch.Tensor,
-    indices: torch.LongTensor,
     layer_idx: int,
     scale: float,
 ):
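
Note (not part of the patches above): a minimal usage sketch of the new GaudiPunicaWrapper, assuming a vLLM build that provides vllm.lora.punica.PunicaWrapper, this extension installed, and an HPU device available. The batch/token sizes, the tensor shapes, and the "hpu" device string are illustrative assumptions; the stacked LoRA weight layout expected by dispatch_bgmv_linear is not defined in these patches, so the shapes below are placeholders only.

# Illustrative sketch only -- sizes, shapes, and the "hpu" device string are
# assumptions, not values taken from these patches or from vLLM.
import torch

from vllm_hpu_extension.punica_hpu import GaudiPunicaWrapper

wrapper = GaudiPunicaWrapper(max_num_batched_tokens=256,
                             max_batches=8,
                             device="hpu")

num_tokens, hidden_size, lora_rank = 4, 1024, 16
x = torch.zeros(num_tokens, hidden_size)  # input activations
y = torch.zeros(num_tokens, hidden_size)  # output buffer, updated in place

# Placeholder stacked LoRA weights; the actual layout is whatever
# dispatch_bgmv_linear in vllm_hpu_extension/ops.py expects.
wa_t_all = torch.zeros(1, 1, lora_rank, hidden_size)
wb_t_all = torch.zeros(1, 1, hidden_size, lora_rank)

# Signature added in PATCH 1/2; the HPU path currently dispatches with a
# fixed layer_idx of 0 and scale of 1.0 regardless of the scale argument.
wrapper.add_lora(y, x, wa_t_all, wb_t_all, scale=1.0)

PATCH 2/2 removes the indices argument from both dispatch helpers' signatures, which is why GaudiPunicaWrapper (and the sketch above) passes only the layer index and scale after the weight tensors.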