oneapi-src · rafbiels · Oct 31, 2024
@@ -953,35 +953,71 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
 
 // CUDA has no memset functions that allow setting values more than 4 bytes. UR
 // API lets you pass an arbitrary "pattern" to the buffer fill, which can be
-// more than 4 bytes. We must break up the pattern into 1 byte values, and set
-// the buffer using multiple strided calls.  The first 4 patterns are set using
-// cuMemsetD32Async then all subsequent 1 byte patterns are set using
-// cuMemset2DAsync which is called for each pattern.
+// more than 4 bytes. We must break up the pattern into 1, 2 or 4-byte words
+// and set the buffer using one continuous fill followed by strided fills.
 ur_result_t commonMemSetLargePattern(CUstream Stream, uint32_t PatternSize,
                                      size_t Size, const void *pPattern,
                                      CUdeviceptr Ptr) {
-  // Calculate the number of patterns, stride, number of times the pattern
-  // needs to be applied, and the number of times the first 32 bit pattern
-  // needs to be applied.
-  auto NumberOfSteps = PatternSize / sizeof(uint8_t);
-  auto Pitch = NumberOfSteps * sizeof(uint8_t);
-  auto Height = Size / NumberOfSteps;
-  auto Count32 = Size / sizeof(uint32_t);
-
-  // Get 4-byte chunk of the pattern and call cuMemsetD32Async
-  auto Value = *(static_cast<const uint32_t *>(pPattern));
-  UR_CHECK_ERROR(cuMemsetD32Async(Ptr, Value, Count32, Stream));
-  for (auto step = 4u; step < NumberOfSteps; ++step) {
-    // take 1 byte of the pattern
-    Value = *(static_cast<const uint8_t *>(pPattern) + step);
-
-    // offset the pointer to the part of the buffer we want to write to
-    auto OffsetPtr = Ptr + (step * sizeof(uint8_t));
-
-    // set all of the pattern chunks
-    UR_CHECK_ERROR(cuMemsetD2D8Async(OffsetPtr, Pitch, Value, sizeof(uint8_t),
-                                     Height, Stream));
+  // Find the largest supported word size into which the pattern can be divided
+  auto BackendWordSize = PatternSize % 4u == 0u   ? 4u
+                         : PatternSize % 2u == 0u ? 2u
+                                                  : 1u;
+
+  // Calculate the number of words in the pattern, the stride, and the number of
+  // times the pattern needs to be applied
+  auto NumberOfSteps = PatternSize / BackendWordSize;
+  auto Pitch = NumberOfSteps * BackendWordSize;
+  auto Height = Size / PatternSize;
+
+  // Same implementation works for any pattern word type (uint8_t, uint16_t,
+  // uint32_t)
+  auto memsetImpl = [BackendWordSize, NumberOfSteps, Pitch, Height, Size, Ptr,
+                     &Stream](const auto *pPatternWords,
+                              auto &&continuousMemset, auto &&stridedMemset) {
+    // If the pattern is 1 word or the first word is repeated throughout, a fast
+    // continuous fill can be used without the need for slower strided fills
+    bool UseOnlyFirstValue{true};
+    for (auto Step{1u}; (Step < NumberOfSteps) && UseOnlyFirstValue; ++Step) {
+      if (*(pPatternWords + Step) != *pPatternWords) {
+        UseOnlyFirstValue = false;
+      }
+    }
+    auto OptimizedNumberOfSteps{UseOnlyFirstValue ? 1u : NumberOfSteps};
+
+    // Fill the pattern in steps of BackendWordSize bytes. Use a continuous
+    // fill in the first step because it's faster than a strided fill. Then,
+    // overwrite the other values in subsequent steps.
+    for (auto Step{0u}; Step < OptimizedNumberOfSteps; ++Step) {
+      if (Step == 0) {
+        UR_CHECK_ERROR(continuousMemset(Ptr, *(pPatternWords),
+                                        Size / BackendWordSize, Stream));
+      } else {
+        UR_CHECK_ERROR(stridedMemset(Ptr + Step * BackendWordSize, Pitch,
+                                     *(pPatternWords + Step), 1u, Height,
+                                     Stream));
+      }
+    }
+  };
+
+  // Apply the implementation to the chosen pattern word type
+  switch (BackendWordSize) {
+  case 4u: {
+    memsetImpl(static_cast<const uint32_t *>(pPattern), cuMemsetD32Async,
+               cuMemsetD2D32Async);
+    break;
+  }
+  case 2u: {
+    memsetImpl(static_cast<const uint16_t *>(pPattern), cuMemsetD16Async,
+               cuMemsetD2D16Async);
+    break;
+  }
+  default: {
+    memsetImpl(static_cast<const uint8_t *>(pPattern), cuMemsetD8Async,
+               cuMemsetD2D8Async);
+    break;
   }
+  }
+
   return UR_RESULT_SUCCESS;
 }
 

@@ -704,25 +704,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
 
 static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize,
                                        size_t Size, const void *pPattern,
-                                       hipDeviceptr_t Ptr) {
+                                       hipDeviceptr_t Ptr,
+                                       uint32_t StartOffset) {
+  // Calculate the number of times the pattern needs to be applied
+  auto Height = Size / PatternSize;
 
-  // Calculate the number of patterns, stride and the number of times the
-  // pattern needs to be applied.
-  auto NumberOfSteps = PatternSize / sizeof(uint8_t);
-  auto Pitch = NumberOfSteps * sizeof(uint8_t);
-  auto Height = Size / NumberOfSteps;
-
-  for (auto step = 4u; step < NumberOfSteps; ++step) {
+  for (auto step = StartOffset; step < PatternSize; ++step) {
     // take 1 byte of the pattern
     auto Value = *(static_cast<const uint8_t *>(pPattern) + step);
 
     // offset the pointer to the part of the buffer we want to write to
-    auto OffsetPtr = reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(Ptr) +
-                                              (step * sizeof(uint8_t)));
+    auto OffsetPtr =
+        reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(Ptr) + step);
 
     // set all of the pattern chunks
-    UR_CHECK_ERROR(hipMemset2DAsync(OffsetPtr, Pitch, Value, sizeof(uint8_t),
-                                    Height, Stream));
+    UR_CHECK_ERROR(
+        hipMemset2DAsync(OffsetPtr, PatternSize, Value, 1u, Height, Stream));
   }
 }
 
@@ -735,11 +732,55 @@ static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize,
 ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
                                      size_t Size, const void *pPattern,
                                      hipDeviceptr_t Ptr) {
+  // Find the largest supported word size into which the pattern can be divided
+  auto BackendWordSize = PatternSize % 4u == 0u   ? 4u
+                         : PatternSize % 2u == 0u ? 2u
+                                                  : 1u;
+
+  // Calculate the number of words in the pattern
+  auto NumberOfSteps = PatternSize / BackendWordSize;
+
+  // If the pattern is 1 word or the first word is repeated throughout, a fast
+  // continuous fill can be used without the need for slower strided fills
+  bool UseOnlyFirstValue{true};
+  auto checkIfFirstWordRepeats = [&UseOnlyFirstValue,
+                                  NumberOfSteps](const auto *pPatternWords) {
+    for (auto Step{1u}; (Step < NumberOfSteps) && UseOnlyFirstValue; ++Step) {
+      if (*(pPatternWords + Step) != *pPatternWords) {
+        UseOnlyFirstValue = false;
+      }
+    }
+  };
 
-  // Get 4-byte chunk of the pattern and call hipMemsetD32Async
-  auto Count32 = Size / sizeof(uint32_t);
-  auto Value = *(static_cast<const uint32_t *>(pPattern));
-  UR_CHECK_ERROR(hipMemsetD32Async(Ptr, Value, Count32, Stream));
+  // Use a continuous fill for the first word in the pattern because it's faster
+  // than a strided fill. Then, overwrite the other values in subsequent steps.
+  switch (BackendWordSize) {
+  case 4u: {
+    auto *pPatternWords = static_cast<const uint32_t *>(pPattern);
+    checkIfFirstWordRepeats(pPatternWords);
+    UR_CHECK_ERROR(
+        hipMemsetD32Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream));
+    break;
+  }
+  case 2u: {
+    auto *pPatternWords = static_cast<const uint16_t *>(pPattern);
+    checkIfFirstWordRepeats(pPatternWords);
+    UR_CHECK_ERROR(
+        hipMemsetD16Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream));
+    break;
+  }
+  default: {
+    auto *pPatternWords = static_cast<const uint8_t *>(pPattern);
+    checkIfFirstWordRepeats(pPatternWords);
+    UR_CHECK_ERROR(
+        hipMemsetD8Async(Ptr, *pPatternWords, Size / BackendWordSize, Stream));
+    break;
+  }
+  }
+
+  if (UseOnlyFirstValue) {
+    return UR_RESULT_SUCCESS;
+  }
 
   // There is a bug in ROCm prior to 6.0.0 version which causes hipMemset2D
   // to behave incorrectly when acting on host pinned memory.
@@ -753,7 +794,7 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
   // we need to check that isManaged attribute is false.
   if (ptrAttribs.hostPointer && !ptrAttribs.isManaged) {
     const auto NumOfCopySteps = Size / PatternSize;
-    const auto Offset = sizeof(uint32_t);
+    const auto Offset = BackendWordSize;
     const auto LeftPatternSize = PatternSize - Offset;
     const auto OffsetPatternPtr = reinterpret_cast<const void *>(
         reinterpret_cast<const uint8_t *>(pPattern) + Offset);
@@ -768,10 +809,12 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
                                     Stream));
     }
   } else {
-    memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr);
+    memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr,
+                        BackendWordSize);
   }
 #else
-  memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr);
+  memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr,
+                      BackendWordSize);
 #endif
   return UR_RESULT_SUCCESS;
 }