Is it the same way to populate d3d resource between NPU and GPU inference? #632

WTian-Yu · 2024-08-20T03:07:19Z

Hi, I'm trying the DirectMLNpuInference sample. When I select the dGPU as the DML device, I can populate the input tensor into the D3D resource without any problem.
However, when I select the NPU as the DML device, memory leaks while the values are being populated, specifically when data is copied from the upload-heap GPU buffer to the default-heap GPU buffer.

information

CPU: Intel(R) Core(TM) Ultra 7 155U
Version 24H2
OS build 26100
NPU driver: 32.0.100.2714
GPU driver(intel): 32.0.101.5768
DirectML 1.15.1

Here is the code in main.cpp and TensorHelper.cpp that I modified from the sample code to reproduce this issue. Thanks.

main.cpp

# include ...

// Creates the D3D12 and DirectML objects used for inference on an NPU adapter.
//
// DXCore, D3D12 and DirectML are loaded dynamically. The first adapter from the
// GENERIC_ML adapter list that reports the NPU hardware-type attribute is used.
//
// Outputs (each receives an AddRef'd interface, or nullptr if it could not be created):
//   d3dDeviceOut        - D3D12 device created on the NPU adapter.
//   commandQueueOut     - compute command queue on that device.
//   dmlDeviceOut        - DirectML device created on top of the D3D12 device.
//   commandAllocatorOut - compute command allocator.
//   commandListOut      - compute command list (left open for recording).
//
// If no NPU adapter (or a required DLL/entry point) is available, the outputs are
// set to nullptr so the caller can detect it. API calls that are attempted and
// fail raise through THROW_IF_FAILED.
void InitializeDirectML(ID3D12Device1** d3dDeviceOut, ID3D12CommandQueue** commandQueueOut, IDMLDevice** dmlDeviceOut,
    ID3D12CommandAllocator** commandAllocatorOut, ID3D12GraphicsCommandList** commandListOut) {
    // Whether to skip adapters which support Graphics in order to target NPU for testing
    //bool forceComputeOnlyDevice = true;
    ComPtr<IDXCoreAdapterFactory> factory;
    HMODULE dxCoreModule = LoadLibraryW(L"DXCore.dll");
    if (dxCoreModule)
    {
        auto dxcoreCreateAdapterFactory = reinterpret_cast<HRESULT(WINAPI*)(REFIID, void**)>(
            GetProcAddress(dxCoreModule, "DXCoreCreateAdapterFactory")
            );
        if (dxcoreCreateAdapterFactory)
        {
            // A failure here leaves `factory` null, which is handled below.
            dxcoreCreateAdapterFactory(IID_PPV_ARGS(&factory));
        }
    }

    // Pick the DXCore adapter.
    ComPtr<IDXCoreAdapter> adapter;
    if (factory)
    {
        const GUID dxGUIDs[] = { DXCORE_ADAPTER_ATTRIBUTE_D3D12_GENERIC_ML };
        ComPtr<IDXCoreAdapterList> adapterList;
        THROW_IF_FAILED(factory->CreateAdapterList(ARRAYSIZE(dxGUIDs), dxGUIDs, IID_PPV_ARGS(&adapterList)));
        const uint32_t adapterCount = adapterList->GetAdapterCount();
        for (uint32_t i = 0; i < adapterCount; i++)
        {
            // Adapter order observed on the reporter's machine (not guaranteed elsewhere):
            // i==0 igpu
            // i==1 dgpu
            // i==2 npu
            // i==3 cpu
            ComPtr<IDXCoreAdapter> candidate;
            THROW_IF_FAILED(adapterList->GetAdapter(i, IID_PPV_ARGS(&candidate)));

            if (candidate->IsAttributeSupported(DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU))
            {
                adapter = std::move(candidate);
                break;
            }

            // Alternative used for comparison: select the dGPU by index instead.
            /*
            if (i == 1) {
                adapter = std::move(candidate);
                break;
            }
            */
        }
    }

    // Create the D3D12 Device
    ComPtr<ID3D12Device1> d3dDevice;
    if (adapter)
    {
        HMODULE d3d12Module = LoadLibraryW(L"d3d12.dll");
        if (d3d12Module)
        {
            auto d3d12CreateDevice = reinterpret_cast<HRESULT(WINAPI*)(IUnknown*, D3D_FEATURE_LEVEL, REFIID, void*)>(
                GetProcAddress(d3d12Module, "D3D12CreateDevice")
                );
            if (d3d12CreateDevice)
            {
                THROW_IF_FAILED(d3d12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_1_0_GENERIC, IID_PPV_ARGS(&d3dDevice)));
            }
        }
    }

    ComPtr<IDMLDevice> dmlDevice;
    ComPtr<ID3D12CommandQueue> commandQueue;
    ComPtr<ID3D12CommandAllocator> commandAllocator;
    ComPtr<ID3D12GraphicsCommandList> commandList;

    // Everything below needs a device. Without this guard a machine with no NPU
    // would dereference a null d3dDevice instead of reporting "not found".
    if (d3dDevice)
    {
        // Create the D3D12 Command Queue and the DML Device
        D3D12_COMMAND_QUEUE_DESC queueDesc = {};
        queueDesc.Type = D3D12_COMMAND_LIST_TYPE_COMPUTE;
        THROW_IF_FAILED(d3dDevice->CreateCommandQueue(
            &queueDesc,
            IID_PPV_ARGS(commandQueue.ReleaseAndGetAddressOf())));

        HMODULE dmlModule = LoadLibraryW(L"DirectML.dll");
        if (dmlModule)
        {
            auto dmlCreateDevice = reinterpret_cast<HRESULT(WINAPI*)(ID3D12Device*, DML_CREATE_DEVICE_FLAGS, DML_FEATURE_LEVEL, REFIID, void*)>(
                GetProcAddress(dmlModule, "DMLCreateDevice1")
                );
            if (dmlCreateDevice)
            {
                THROW_IF_FAILED(dmlCreateDevice(d3dDevice.Get(), DML_CREATE_DEVICE_FLAG_NONE, DML_FEATURE_LEVEL_5_0, IID_PPV_ARGS(dmlDevice.ReleaseAndGetAddressOf())));
            }
        }

        // Command allocator and list used by the caller to record copy work.
        THROW_IF_FAILED(d3dDevice->CreateCommandAllocator(
            D3D12_COMMAND_LIST_TYPE_COMPUTE,
            IID_PPV_ARGS(commandAllocator.ReleaseAndGetAddressOf())));

        THROW_IF_FAILED(d3dDevice->CreateCommandList(
            0,
            D3D12_COMMAND_LIST_TYPE_COMPUTE,
            commandAllocator.Get(),
            nullptr,
            IID_PPV_ARGS(commandList.ReleaseAndGetAddressOf())));
    }

    // Hand the (possibly null) interfaces to the caller.
    d3dDevice.CopyTo(d3dDeviceOut);
    commandQueue.CopyTo(commandQueueOut);
    dmlDevice.CopyTo(dmlDeviceOut);
    commandAllocator.CopyTo(commandAllocatorOut);
    commandList.CopyTo(commandListOut);
}

// Entry point: sets up DirectML on the NPU, creates an ONNX Runtime session that
// uses the DML execution provider, and builds a DML-backed input tensor for the
// model's first input.
//
// Returns 0 on success and 1 when no DirectML device could be created.
int main()
{
    ComPtr<ID3D12Device1> d3dDevice;
    ComPtr<IDMLDevice> dmlDevice;
    ComPtr<ID3D12CommandQueue> commandQueue;
    ComPtr<ID3D12CommandAllocator> command_allocator_;
    ComPtr<ID3D12GraphicsCommandList> command_list_;
    InitializeDirectML(d3dDevice.GetAddressOf(), commandQueue.GetAddressOf(), dmlDevice.GetAddressOf(),
        command_allocator_.GetAddressOf(), command_list_.GetAddressOf());

    // Nothing else can run without a DirectML device.
    if (!dmlDevice)
    {
        printf("No NPU device found\n");
        return 1;
    }

    const OrtApi& ortApi = Ort::GetApi();
    static Ort::Env s_OrtEnv{ nullptr };
    s_OrtEnv = Ort::Env(Ort::ThreadingOptions{});
    s_OrtEnv.DisableTelemetryEvents();

    auto sessionOptions = Ort::SessionOptions{};
    sessionOptions.DisableMemPattern();
    sessionOptions.DisablePerSessionThreads();
    sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);

    // Add the DML execution provider to ORT using the DML Device and D3D12 Command Queue created above.
    const OrtDmlApi* ortDmlApi = nullptr;
    Ort::ThrowOnError(ortApi.GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&ortDmlApi)));
    Ort::ThrowOnError(ortDmlApi->SessionOptionsAppendExecutionProvider_DML1(sessionOptions, dmlDevice.Get(), commandQueue.Get()));

    // Create the session
    auto session = Ort::Session(s_OrtEnv, L"mobilenetv2-7-fp16.onnx", sessionOptions);
    //const char* inputName = "input";
    //const char* outputName = "output";

    // Create input tensor from the type/shape of the model's first input.
    Ort::TypeInfo type_info = session.GetInputTypeInfo(0);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
    auto input = CreateDmlValue(tensor_info, commandQueue.Get(), command_allocator_.Get(), command_list_.Get());

    return 0;
}

TensorHelper.cpp

#include ...

// Throws if `hr` is a failure HRESULT; does nothing on success.
//
// A bare `throw;` outside of a catch handler does not throw anything - it calls
// std::terminate. Use the same WIL macro as the rest of this file so that a real,
// catchable exception carrying the HRESULT is raised.
inline void ThrowOnFailed(HRESULT hr) {
  THROW_IF_FAILED(hr);
}

// Owning untyped pointer with a function-pointer deleter. CreateDmlValue returns one
// that frees the DML GPU allocation through the ORT DML API.
using UniqueNativePtr = std::unique_ptr<void, void (*)(void*)>;

// Returns the size in bytes of one element of the given ONNX tensor element type.
//
// Throws Ort::Exception (ORT_INVALID_ARGUMENT) for element types that are not
// listed below. The previous bare `throw;` had no active exception to rethrow
// and therefore terminated the process instead.
size_t GetSizeFromType(ONNXTensorElementDataType type) {
// Expands to a `case` that maps the ORT enum value for T to sizeof(T).
#define CASE_FOR_TYPE(T)                 \
  case Ort::TypeToTensorType<T>::type: { \
    return sizeof(T);                    \
  }

  switch (type) {
    CASE_FOR_TYPE(Ort::Float16_t);
    CASE_FOR_TYPE(Ort::BFloat16_t);
    CASE_FOR_TYPE(float);
    CASE_FOR_TYPE(double);
    CASE_FOR_TYPE(int8_t);
    CASE_FOR_TYPE(int16_t);
    CASE_FOR_TYPE(int32_t);
    CASE_FOR_TYPE(int64_t);
    CASE_FOR_TYPE(uint8_t);
    CASE_FOR_TYPE(uint16_t);
    CASE_FOR_TYPE(uint32_t);
    CASE_FOR_TYPE(uint64_t);
    CASE_FOR_TYPE(bool);
#if !defined(DISABLE_FLOAT8_TYPES)
    CASE_FOR_TYPE(Ort::Float8E4M3FN_t);
    CASE_FOR_TYPE(Ort::Float8E4M3FNUZ_t);
    CASE_FOR_TYPE(Ort::Float8E5M2_t);
    CASE_FOR_TYPE(Ort::Float8E5M2FNUZ_t);
#endif
    default:
      throw Ort::Exception("Unsupported tensor element data type", ORT_INVALID_ARGUMENT);
  }
#undef CASE_FOR_TYPE
}

// Creates a committed D3D12 buffer large enough to hold a tensor of the given
// element type and shape.
//
//   device   - device used to create the resource.
//   type     - ONNX element type; determines bytes per element.
//   shape    - tensor dimensions; every dimension must be non-negative.
//   heapType - heap to allocate from. UPLOAD/READBACK buffers are created without
//              the unordered-access flag; all other heaps get it.
//
// Returns the new resource, created in D3D12_RESOURCE_STATE_COMMON.
// Throws (via WIL) E_INVALIDARG for a negative dimension and the failing HRESULT
// if resource creation fails.
Microsoft::WRL::ComPtr<ID3D12Resource> CreateD3D12Resource(
    ID3D12Device* device,
    ONNXTensorElementDataType type,
    const std::vector<int64_t>& shape,
    D3D12_HEAP_TYPE heapType) {
  // Element count = product of all dimensions. A negative dimension (e.g. an
  // unresolved dynamic dimension reported as -1) would otherwise wrap around to
  // an enormous unsigned byte size, so reject it up front.
  size_t elementCount = 1;
  for (const int64_t dim : shape) {
    THROW_HR_IF(E_INVALIDARG, dim < 0);
    elementCount *= static_cast<size_t>(dim);
  }

  size_t bufferByteSize = GetSizeFromType(type) * elementCount;

  // DML needs the resources' sizes to be a multiple of 4 bytes
  bufferByteSize = (bufferByteSize + 3) & ~static_cast<size_t>(3);

  // CPU-visible heaps cannot be used for unordered access.
  const bool cpuVisibleHeap =
      heapType == D3D12_HEAP_TYPE_UPLOAD || heapType == D3D12_HEAP_TYPE_READBACK;
  const D3D12_RESOURCE_FLAGS flags =
      cpuVisibleHeap ? D3D12_RESOURCE_FLAG_NONE : D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;

  D3D12_HEAP_PROPERTIES heapProperties = {};
  heapProperties.Type = heapType;
  heapProperties.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
  heapProperties.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
  heapProperties.CreationNodeMask = 0;
  heapProperties.VisibleNodeMask = 0;

  D3D12_RESOURCE_DESC resourceDesc = {};
  resourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
  resourceDesc.Alignment = 0;
  resourceDesc.Width = static_cast<uint64_t>(bufferByteSize);
  resourceDesc.Height = 1;
  resourceDesc.DepthOrArraySize = 1;
  resourceDesc.MipLevels = 1;
  resourceDesc.Format = DXGI_FORMAT_UNKNOWN;
  resourceDesc.SampleDesc = {1, 0};
  resourceDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
  resourceDesc.Flags = flags;

  Microsoft::WRL::ComPtr<ID3D12Resource> resource;
  THROW_IF_FAILED(device->CreateCommittedResource(
      &heapProperties,
      D3D12_HEAP_FLAG_NONE,
      &resourceDesc,
      D3D12_RESOURCE_STATE_COMMON,
      nullptr,
      IID_PPV_ARGS(resource.ReleaseAndGetAddressOf())));

  return resource;
}

// Creates a committed D3D12 buffer of at least `resourceByteSize` bytes.
//
//   d3dDevice        - device used to create the resource.
//   resourceByteSize - requested size; raised to DML_MINIMUM_BUFFER_TENSOR_ALIGNMENT
//                      if smaller, then rounded up to a multiple of 4 bytes.
//   gpuResource      - receives the new resource. Any resource it already holds is
//                      released first, so reusing the same ComPtr does not leak.
//   heapType         - heap to allocate from (default: D3D12_HEAP_TYPE_DEFAULT).
//   resourceState    - initial resource state (default: D3D12_RESOURCE_STATE_COMMON).
//   resourceFlags    - resource flags (default: allow unordered access).
//
// Throws (via THROW_IF_FAILED) if resource creation fails.
void CreateD3D12ResourceOfByteSize(
    ID3D12Device* d3dDevice,
    size_t resourceByteSize,
    Microsoft::WRL::ComPtr<ID3D12Resource>& gpuResource,
    D3D12_HEAP_TYPE heapType = D3D12_HEAP_TYPE_DEFAULT,
    D3D12_RESOURCE_STATES resourceState = D3D12_RESOURCE_STATE_COMMON,
    D3D12_RESOURCE_FLAGS resourceFlags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS
)
{
    resourceByteSize = std::max(resourceByteSize, size_t(DML_MINIMUM_BUFFER_TENSOR_ALIGNMENT));

    // DML needs the resources' sizes to be a multiple of 4 bytes
    resourceByteSize = (resourceByteSize + 3) & ~static_cast<size_t>(3);

    D3D12_HEAP_PROPERTIES heapProperties = {};
    heapProperties.Type = heapType; // Default to D3D12_HEAP_TYPE_DEFAULT.
    heapProperties.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
    heapProperties.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
    heapProperties.CreationNodeMask = 1;
    heapProperties.VisibleNodeMask = 1;

    D3D12_RESOURCE_DESC resourceDesc = {};
    resourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
    resourceDesc.Alignment = 0;
    resourceDesc.Width = static_cast<uint64_t>(resourceByteSize);
    resourceDesc.Height = 1;
    resourceDesc.DepthOrArraySize = 1;
    resourceDesc.MipLevels = 1;
    resourceDesc.Format = DXGI_FORMAT_UNKNOWN;
    resourceDesc.SampleDesc = { 1, 0 };
    resourceDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
    resourceDesc.Flags = resourceFlags;

    // ReleaseAndGetAddressOf drops whatever the caller's ComPtr currently holds
    // before it is overwritten; GetAddressOf would silently leak it.
    THROW_IF_FAILED(d3dDevice->CreateCommittedResource(
        &heapProperties,
        D3D12_HEAP_FLAG_NONE,
        &resourceDesc,
        resourceState, // Default to D3D12_RESOURCE_STATE_COMMON
        nullptr,
        IID_PPV_ARGS(gpuResource.ReleaseAndGetAddressOf())
    ));
}

// Blocks the calling thread until all work submitted to `queue` so far has finished.
//
// A fresh fence is signalled from the queue and the thread waits on an event that
// the fence sets when it reaches the signalled value.
// Throws (via WIL) if any D3D12 call, the event creation, or the wait fails.
void WaitForQueueToComplete(ID3D12CommandQueue* queue)
{
    Microsoft::WRL::ComPtr<ID3D12Device> device;
    THROW_IF_FAILED(queue->GetDevice(IID_PPV_ARGS(&device)));

    Microsoft::WRL::ComPtr<ID3D12Fence> fence;
    THROW_IF_FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(fence.GetAddressOf())));
    THROW_IF_FAILED(queue->Signal(fence.Get(), 1));

    // Auto-reset, initially non-signalled event. Check creation explicitly so a
    // failure is reported with its real error code instead of surfacing later as
    // a generic wait failure.
    wil::unique_handle fenceEvent(CreateEvent(nullptr, FALSE, FALSE, nullptr));
    THROW_LAST_ERROR_IF_NULL(fenceEvent.get());

    THROW_IF_FAILED(fence->SetEventOnCompletion(1, fenceEvent.get()));
    THROW_HR_IF(E_FAIL, WaitForSingleObject(fenceEvent.get(), INFINITE) != WAIT_OBJECT_0);
}

// Creates a DML-backed Ort::Value for the given tensor type/shape and repeatedly
// uploads CPU data into its D3D12 buffer.
//
//   tensor_info      - element type and shape of the tensor to create.
//   queue            - command queue used to execute the copies; its device owns
//                      all resources created here.
//   commandAllocator - allocator backing `commandList`; reset after every submission.
//   commandList      - open command list used to record the copy; it is closed,
//                      executed and reset on every iteration, so it is open again
//                      on return.
//
// Returns the tensor value together with the owner of the DML GPU allocation it
// refers to; the allocation owner must outlive the value.
//
// The upload is executed 1,000,000 times with a fresh upload buffer per iteration.
// This is the stress loop used to observe the memory growth reported for the NPU.
std::pair<Ort::Value, UniqueNativePtr> CreateDmlValue(
    const Ort::ConstTensorTypeAndShapeInfo& tensor_info,
    ID3D12CommandQueue* queue,
    ID3D12CommandAllocator* commandAllocator, ID3D12GraphicsCommandList* commandList) {

  auto& ortApi = Ort::GetApi();
  const OrtDmlApi* ortDmlApi = nullptr;
  Ort::ThrowOnError(ortApi.GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&ortDmlApi)));

  Microsoft::WRL::ComPtr<ID3D12Device> device;
  THROW_IF_FAILED(queue->GetDevice(IID_PPV_ARGS(&device)));

  auto shape = tensor_info.GetShape();
  auto resource = CreateD3D12Resource(device.Get(), tensor_info.GetElementType(), shape, D3D12_HEAP_TYPE_DEFAULT);

  // Size of the destination buffer; it does not change between iterations.
  const D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc();
  assert(resourceDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER);
  const size_t dataSizeInBytes = static_cast<size_t>(resourceDesc.Width);

  // Zero-filled stand-in for real input data.
  // NOTE(review): 301056 presumably equals 1*3*224*224 fp16 elements - confirm against the model.
  std::vector<std::byte> sourceData(301056);

  // Never read past the end of sourceData, even if the destination is larger.
  const size_t clampedDataByteSize = std::min(dataSizeInBytes, sourceData.size());

  // After the copy, move the destination from COPY_DEST to UNORDERED_ACCESS.
  D3D12_RESOURCE_BARRIER resourceBarrier = {};
  resourceBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
  resourceBarrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
  resourceBarrier.Transition.pResource = resource.Get();
  resourceBarrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
  resourceBarrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST;// D3D12_RESOURCE_STATE_COMMON;
  resourceBarrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;// D3D12_RESOURCE_STATE_COPY_DEST;

  for (int i = 0; i < 1000000; i++) {
      // Create intermediate upload resource visible to both CPU and GPU.
      Microsoft::WRL::ComPtr<ID3D12Resource> uploadBuffer;
      CreateD3D12ResourceOfByteSize(device.Get(), dataSizeInBytes, uploadBuffer,
            D3D12_HEAP_TYPE_UPLOAD, D3D12_RESOURCE_STATE_GENERIC_READ, D3D12_RESOURCE_FLAG_NONE);

      // Copy CPU-side data to shared memory that is both CPU and GPU visible.
      std::byte* uploadBufferData = nullptr;
      THROW_IF_FAILED(uploadBuffer->Map(0, nullptr, reinterpret_cast<void**>(&uploadBufferData)));
      memcpy(uploadBufferData, sourceData.data(), clampedDataByteSize);
      uploadBuffer->Unmap(0, nullptr);

      // Issue deferred command to copy from the intermediate shared resource to the final GPU resource,
      // and then execute the commands.
      commandList->CopyResource(resource.Get(), uploadBuffer.Get());
      commandList->ResourceBarrier(1, &resourceBarrier);

      THROW_IF_FAILED(commandList->Close());

      ID3D12CommandList* commandLists[] = { commandList };
      queue->ExecuteCommandLists(ARRAYSIZE(commandLists), commandLists);

      // The upload buffer is released at the end of this iteration, so the GPU
      // must be done with it before the loop continues.
      WaitForQueueToComplete(queue);

      THROW_IF_FAILED(commandAllocator->Reset());
      THROW_IF_FAILED(commandList->Reset(commandAllocator, nullptr));
  }

  void* dmlAllocatorResource = nullptr;
  Ort::ThrowOnError(ortDmlApi->CreateGPUAllocationFromD3DResource(resource.Get(), &dmlAllocatorResource));

  // Take ownership immediately so the allocation is freed on every exit path.
  auto uniqueDmlAllocatorResource = UniqueNativePtr(dmlAllocatorResource, [](void* ptr) {
    auto& ortApi = Ort::GetApi();
    const OrtDmlApi* ortDmlApi;
    Ort::ThrowOnError(ortApi.GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&ortDmlApi)));
    Ort::ThrowOnError(ortDmlApi->FreeGPUAllocation(ptr));
  });

  Ort::MemoryInfo memoryInfo("DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);

  // create the OrtValue as a tensor letting ort know that we own the data buffer
  OrtValue* value = nullptr;
  Ort::ThrowOnError(ortApi.CreateTensorWithDataAsOrtValue(
      memoryInfo,
      uniqueDmlAllocatorResource.get(),
      dataSizeInBytes,
      shape.data(),
      shape.size(),
      tensor_info.GetElementType(),
      &value));

  return { Ort::Value(value), std::move(uniqueDmlAllocatorResource) };
}

The text was updated successfully, but these errors were encountered:

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Is it the same way to populate d3d resource between NPU and GPU inference? #632

Is it the same way to populate d3d resource between NPU and GPU inference? #632

WTian-Yu commented Aug 20, 2024

Is it the same way to populate d3d resource between NPU and GPU inference? #632

Is it the same way to populate d3d resource between NPU and GPU inference? #632

Comments

WTian-Yu commented Aug 20, 2024

information