Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Is it the same way to populate d3d resource between NPU and GPU inference? #632

Open
WTian-Yu opened this issue Aug 20, 2024 · 0 comments
Open

Comments

@WTian-Yu
Copy link

Hi, I'm trying the DirectMLNpuInference sample. When I select the dGPU as the DML device, I can populate the input tensor into the D3D resource without any problem.
However, when I select the NPU as the DML device, memory leaks while the data is copied from the upload-heap buffer to the default-heap buffer during that population step.

information

CPU: Intel(R) Core(TM) Ultra 7 155U
Version 24H2
OS build 26100
NPU driver: 32.0.100.2714
GPU driver(intel): 32.0.101.5768
DirectML 1.15.1

Here is the code from main.cpp and TensorHelper.cpp, which I modified from the sample code to reproduce this issue. Thanks.

main.cpp

# include ...

// Creates the D3D12 / DirectML objects needed for inference on the first DXCore
// adapter that reports the NPU hardware-type attribute.
//
// DXCore.dll, d3d12.dll and DirectML.dll are loaded dynamically. If a module or
// entry point is missing, or no matching adapter exists, the objects that depend
// on it are not created and the corresponding out-parameters receive the value
// of an empty ComPtr. Callers are expected to check the results (main checks
// the DML device) before using them.
//
// Out-parameters:
//   d3dDeviceOut        - D3D12 device created on the selected adapter.
//   commandQueueOut     - compute command queue on that device.
//   dmlDeviceOut        - DirectML device created on top of the D3D12 device.
//   commandAllocatorOut - compute command allocator.
//   commandListOut      - compute command list recording into that allocator.
//
// A failing HRESULT from any creation call is raised through THROW_IF_FAILED.
void InitializeDirectML(ID3D12Device1** d3dDeviceOut, ID3D12CommandQueue** commandQueueOut, IDMLDevice** dmlDeviceOut,
    ID3D12CommandAllocator** commandAllocatorOut, ID3D12GraphicsCommandList** commandListOut) {
    // Whether to skip adapters which support Graphics in order to target NPU for testing
    //bool forceComputeOnlyDevice = true;
    ComPtr<IDXCoreAdapterFactory> factory;
    HMODULE dxCoreModule = LoadLibraryW(L"DXCore.dll");
    if (dxCoreModule)
    {
        auto dxcoreCreateAdapterFactory = reinterpret_cast<HRESULT(WINAPI*)(REFIID, void**)>(
            GetProcAddress(dxCoreModule, "DXCoreCreateAdapterFactory")
            );
        if (dxcoreCreateAdapterFactory)
        {
            // Best effort: on failure `factory` stays empty and everything below is skipped.
            dxcoreCreateAdapterFactory(IID_PPV_ARGS(&factory));
        }
    }
    // Create the DXCore Adapter
    ComPtr<IDXCoreAdapter> adapter;
    if (factory)
    {
        const GUID dxGUIDs[] = { DXCORE_ADAPTER_ATTRIBUTE_D3D12_GENERIC_ML };
        ComPtr<IDXCoreAdapterList> adapterList;
        THROW_IF_FAILED(factory->CreateAdapterList(ARRAYSIZE(dxGUIDs), dxGUIDs, IID_PPV_ARGS(&adapterList)));
        for (uint32_t i = 0, adapterCount = adapterList->GetAdapterCount(); i < adapterCount; i++)
        {
            // Adapter order observed by the issue reporter (machine specific):
            // i==0 igpu
            // i==1 dgpu
            // i==2 npu
            // i==3 cpu
            ComPtr<IDXCoreAdapter> nextGpuAdapter;
            THROW_IF_FAILED(adapterList->GetAdapter(i, IID_PPV_ARGS(&nextGpuAdapter)));

            // Pick the first adapter that advertises itself as an NPU.
            if (nextGpuAdapter->IsAttributeSupported(DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU))
            {
                adapter = std::move(nextGpuAdapter);
                break;
            }

            // Alternative used for comparison: select the dGPU by index instead.
            /*
            if (i == 1) {
                adapter = std::move(nextGpuAdapter);
                break;
            }
            */
        }
    }
    // Create the D3D12 Device
    ComPtr<ID3D12Device1> d3dDevice;
    if (adapter)
    {
        HMODULE d3d12Module = LoadLibraryW(L"d3d12.dll");
        if (d3d12Module)
        {
            auto d3d12CreateDevice = reinterpret_cast<HRESULT(WINAPI*)(IUnknown*, D3D_FEATURE_LEVEL, REFIID, void*)>(
                GetProcAddress(d3d12Module, "D3D12CreateDevice")
                );
            if (d3d12CreateDevice)
            {
                THROW_IF_FAILED(d3d12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_1_0_GENERIC, IID_PPV_ARGS(&d3dDevice)));
            }
        }
    }
    // Create the DML Device, the D3D12 Command Queue, and the command allocator/list.
    ComPtr<IDMLDevice> dmlDevice;
    ComPtr<ID3D12CommandQueue> commandQueue;
    ComPtr<ID3D12CommandAllocator> commandAllocator;
    ComPtr<ID3D12GraphicsCommandList> commandList;
    if (d3dDevice)
    {
        D3D12_COMMAND_QUEUE_DESC queueDesc = {};
        queueDesc.Type = D3D12_COMMAND_LIST_TYPE_COMPUTE;
        THROW_IF_FAILED(d3dDevice->CreateCommandQueue(
            &queueDesc,
            IID_PPV_ARGS(commandQueue.ReleaseAndGetAddressOf())));
        HMODULE dmlModule = LoadLibraryW(L"DirectML.dll");
        if (dmlModule)
        {
            auto dmlCreateDevice = reinterpret_cast<HRESULT(WINAPI*)(ID3D12Device*, DML_CREATE_DEVICE_FLAGS, DML_FEATURE_LEVEL, REFIID, void*)>(
                GetProcAddress(dmlModule, "DMLCreateDevice1")
                );
            if (dmlCreateDevice)
            {
                THROW_IF_FAILED(dmlCreateDevice(d3dDevice.Get(), DML_CREATE_DEVICE_FLAG_NONE, DML_FEATURE_LEVEL_5_0, IID_PPV_ARGS(dmlDevice.ReleaseAndGetAddressOf())));
            }
        }

        // These calls used to run unconditionally and dereferenced a null
        // d3dDevice whenever no adapter/device was available. Keeping them
        // inside this guard lets the caller detect the missing device instead
        // of crashing here.
        THROW_IF_FAILED(d3dDevice->CreateCommandAllocator(
            D3D12_COMMAND_LIST_TYPE_COMPUTE,
            IID_PPV_ARGS(commandAllocator.ReleaseAndGetAddressOf())));

        THROW_IF_FAILED(d3dDevice->CreateCommandList(
            0,
            D3D12_COMMAND_LIST_TYPE_COMPUTE,
            commandAllocator.Get(),
            nullptr,
            IID_PPV_ARGS(commandList.ReleaseAndGetAddressOf())));
    }

    // Hand the (possibly empty) objects to the caller.
    d3dDevice.CopyTo(d3dDeviceOut);
    commandQueue.CopyTo(commandQueueOut);
    dmlDevice.CopyTo(dmlDeviceOut);
    commandAllocator.CopyTo(commandAllocatorOut);
    commandList.CopyTo(commandListOut);
}

// Entry point of the repro: initializes DirectML, creates an ONNX Runtime
// session that uses the DML execution provider, and builds the DML-backed
// input tensor through CreateDmlValue.
//
// Returns 0 on success and 1 when no DML device could be created. Errors raised
// by THROW_IF_FAILED / Ort::ThrowOnError are not caught here.
int main()
{
    ComPtr<ID3D12Device1> d3dDevice;
    ComPtr<IDMLDevice> dmlDevice;
    ComPtr<ID3D12CommandQueue> commandQueue;
    ComPtr<ID3D12CommandAllocator> command_allocator_;
    ComPtr<ID3D12GraphicsCommandList> command_list_;
    InitializeDirectML(d3dDevice.GetAddressOf(), commandQueue.GetAddressOf(), dmlDevice.GetAddressOf(),
        command_allocator_.GetAddressOf(), command_list_.GetAddressOf());

    // Nothing to do without a DML device (no NPU adapter, or a DLL is missing).
    if (!dmlDevice)
    {
        printf("No NPU device found\n");
        return 1;
    }

    const OrtApi& ortApi = Ort::GetApi();
    static Ort::Env s_OrtEnv{ nullptr };
    s_OrtEnv = Ort::Env(Ort::ThreadingOptions{});
    s_OrtEnv.DisableTelemetryEvents();

    auto sessionOptions = Ort::SessionOptions{};
    sessionOptions.DisableMemPattern();
    sessionOptions.DisablePerSessionThreads();
    sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);

    // Add the DML execution provider to ORT using the DML Device and D3D12 Command Queue created above.
    const OrtDmlApi* ortDmlApi = nullptr;
    Ort::ThrowOnError(ortApi.GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&ortDmlApi)));
    Ort::ThrowOnError(ortDmlApi->SessionOptionsAppendExecutionProvider_DML1(sessionOptions, dmlDevice.Get(), commandQueue.Get()));

    // Create the session
    auto session = Ort::Session(s_OrtEnv, L"mobilenetv2-7-fp16.onnx", sessionOptions);
    //const char* inputName = "input";
    //const char* outputName = "output";

    // Create input tensor from the type/shape of the model's first input.
    Ort::TypeInfo type_info = session.GetInputTypeInfo(0);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
    auto input = CreateDmlValue(tensor_info, commandQueue.Get(), command_allocator_.Get(), command_list_.Get());

    return 0;
}

TensorHelper.cpp

#include ...

#include <cstdio>
#include <stdexcept>

// Throws std::runtime_error when `hr` is a failing HRESULT; does nothing on
// success. The message carries the HRESULT value in hexadecimal.
//
// The previous implementation used a bare `throw;`, which re-throws the
// exception currently being handled. No exception is active at these call
// sites, so it terminated the process instead of reporting the failure.
inline void ThrowOnFailed(HRESULT hr) {
  if (FAILED(hr)) {
    char message[48];
    std::snprintf(message, sizeof(message), "HRESULT failure: 0x%08lX",
                  static_cast<unsigned long>(hr));
    throw std::runtime_error(message);
  }
}

// Owning handle for an opaque native allocation: a void* paired with a plain
// function-pointer deleter. CreateDmlValue uses it to hold the DML GPU
// allocation and to release it through OrtDmlApi::FreeGPUAllocation.
using UniqueNativePtr = std::unique_ptr<void, void (*)(void*)>;

// Returns sizeof() of the C++ type that corresponds to the given ONNX tensor
// element type.
//
// Throws std::invalid_argument for element types not listed below. The previous
// bare `throw;` had no active exception to re-throw and terminated the process.
size_t GetSizeFromType(ONNXTensorElementDataType type) {
// Expands to one `case` that maps the ONNX enum value of T to sizeof(T).
#define CASE_FOR_TYPE(T)                 \
  case Ort::TypeToTensorType<T>::type: { \
    return sizeof(T);                    \
  }

  switch (type) {
    CASE_FOR_TYPE(Ort::Float16_t);
    CASE_FOR_TYPE(Ort::BFloat16_t);
    CASE_FOR_TYPE(float);
    CASE_FOR_TYPE(double);
    CASE_FOR_TYPE(int8_t);
    CASE_FOR_TYPE(int16_t);
    CASE_FOR_TYPE(int32_t);
    CASE_FOR_TYPE(int64_t);
    CASE_FOR_TYPE(uint8_t);
    CASE_FOR_TYPE(uint16_t);
    CASE_FOR_TYPE(uint32_t);
    CASE_FOR_TYPE(uint64_t);
    CASE_FOR_TYPE(bool);
#if !defined(DISABLE_FLOAT8_TYPES)
    CASE_FOR_TYPE(Ort::Float8E4M3FN_t);
    CASE_FOR_TYPE(Ort::Float8E4M3FNUZ_t);
    CASE_FOR_TYPE(Ort::Float8E5M2_t);
    CASE_FOR_TYPE(Ort::Float8E5M2FNUZ_t);
#endif
    default:
      throw std::invalid_argument("GetSizeFromType: unsupported ONNX tensor element type");
  }
#undef CASE_FOR_TYPE
}

// Creates a committed D3D12 buffer large enough to hold a tensor with the given
// element type and shape.
//
//   device   - device that creates the resource.
//   type     - ONNX element type; its size in bytes comes from GetSizeFromType.
//   shape    - tensor dimensions; the element count is their product.
//   heapType - heap to place the buffer in. Upload and readback heaps get no
//              resource flags; every other heap type allows unordered access.
//
// Returns the new buffer, created in D3D12_RESOURCE_STATE_COMMON. A failing
// HRESULT is reported through ThrowOnFailed.
//
// NOTE(review): every dimension is multiplied as-is, so a negative entry
// (presumably how a dynamic dimension would be reported - confirm) yields a
// meaningless byte size.
Microsoft::WRL::ComPtr<ID3D12Resource> CreateD3D12Resource(
    ID3D12Device* device,
    ONNXTensorElementDataType type,
    const std::vector<int64_t>& shape,
    D3D12_HEAP_TYPE heapType) {
  // Number of elements = product of all dimensions.
  int64_t elementCount = 1;
  for (const int64_t dimension : shape) {
    elementCount *= dimension;
  }

  auto byteSize = GetSizeFromType(type) * elementCount;

  // DML needs the resources' sizes to be a multiple of 4 bytes
  const auto remainder = byteSize % 4;
  if (remainder != 0) {
    byteSize += 4 - remainder;
  }

  const bool isUploadOrReadback =
      heapType == D3D12_HEAP_TYPE_UPLOAD || heapType == D3D12_HEAP_TYPE_READBACK;
  const D3D12_RESOURCE_FLAGS flags = isUploadOrReadback
                                         ? D3D12_RESOURCE_FLAG_NONE
                                         : D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;

  D3D12_HEAP_PROPERTIES heapProperties = {};
  heapProperties.Type = heapType;
  heapProperties.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
  heapProperties.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
  heapProperties.CreationNodeMask = 0;
  heapProperties.VisibleNodeMask = 0;

  D3D12_RESOURCE_DESC resourceDesc = {};
  resourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
  resourceDesc.Alignment = 0;
  resourceDesc.Width = static_cast<uint64_t>(byteSize);
  resourceDesc.Height = 1;
  resourceDesc.DepthOrArraySize = 1;
  resourceDesc.MipLevels = 1;
  resourceDesc.Format = DXGI_FORMAT_UNKNOWN;
  resourceDesc.SampleDesc = {1, 0};
  resourceDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
  resourceDesc.Flags = flags;

  Microsoft::WRL::ComPtr<ID3D12Resource> buffer;
  ThrowOnFailed(device->CreateCommittedResource(
      &heapProperties,
      D3D12_HEAP_FLAG_NONE,
      &resourceDesc,
      D3D12_RESOURCE_STATE_COMMON,
      nullptr,
      IID_PPV_ARGS(&buffer)));

  return buffer;
}

// Creates a committed D3D12 buffer of at least `resourceByteSize` bytes and
// stores it in `gpuResource`.
//
//   d3dDevice        - device that creates the resource.
//   resourceByteSize - requested size; raised to at least
//                      DML_MINIMUM_BUFFER_TENSOR_ALIGNMENT and rounded up to a
//                      multiple of 4 bytes.
//   gpuResource      - receives the new buffer. An interface it already holds
//                      is released first. The previous GetAddressOf() call
//                      overwrote the pointer without releasing it.
//   heapType         - heap type (defaults to D3D12_HEAP_TYPE_DEFAULT).
//   resourceState    - initial state (defaults to D3D12_RESOURCE_STATE_COMMON).
//   resourceFlags    - resource flags (defaults to allowing unordered access).
//
// A failing HRESULT is raised through THROW_IF_FAILED.
void CreateD3D12ResourceOfByteSize(
    ID3D12Device* d3dDevice,
    size_t resourceByteSize,
    Microsoft::WRL::ComPtr<ID3D12Resource>& gpuResource,
    D3D12_HEAP_TYPE heapType = D3D12_HEAP_TYPE_DEFAULT,
    D3D12_RESOURCE_STATES resourceState = D3D12_RESOURCE_STATE_COMMON,
    D3D12_RESOURCE_FLAGS resourceFlags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS
)
{
    resourceByteSize = std::max(resourceByteSize, size_t(DML_MINIMUM_BUFFER_TENSOR_ALIGNMENT));

    // DML needs the resources' sizes to be a multiple of 4 bytes
    resourceByteSize = (resourceByteSize + 3) & ~size_t{3};

    D3D12_HEAP_PROPERTIES heapProperties = {};
    heapProperties.Type = heapType; // Default to D3D12_HEAP_TYPE_DEFAULT.
    heapProperties.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
    heapProperties.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
    heapProperties.CreationNodeMask = 1;
    heapProperties.VisibleNodeMask = 1;

    D3D12_RESOURCE_DESC resourceDesc = {};
    resourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
    resourceDesc.Alignment = 0;
    resourceDesc.Width = static_cast<uint64_t>(resourceByteSize);
    resourceDesc.Height = 1;
    resourceDesc.DepthOrArraySize = 1;
    resourceDesc.MipLevels = 1;
    resourceDesc.Format = DXGI_FORMAT_UNKNOWN;
    resourceDesc.SampleDesc = { 1, 0 };
    resourceDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
    resourceDesc.Flags = resourceFlags;

    THROW_IF_FAILED(d3dDevice->CreateCommittedResource(
        &heapProperties,
        D3D12_HEAP_FLAG_NONE,
        &resourceDesc,
        resourceState, // Default to D3D12_RESOURCE_STATE_COMMON
        nullptr,
        IID_PPV_ARGS(gpuResource.ReleaseAndGetAddressOf())
    ));
}

// Blocks the calling thread until all work submitted to `queue` so far has
// completed.
//
// A fresh fence and event are created on every call: the queue signals the
// fence, and the thread waits on the event set at that fence value. Any failure
// is raised through the THROW_* macros, the error style used by the rest of
// this function. The device query previously went through ThrowOnFailed.
void WaitForQueueToComplete(ID3D12CommandQueue* queue)
{
    // Fence value that the queue signals and this function waits for.
    constexpr uint64_t kCompletionValue = 1;

    Microsoft::WRL::ComPtr<ID3D12Device> device;
    THROW_IF_FAILED(queue->GetDevice(IID_PPV_ARGS(device.GetAddressOf())));

    Microsoft::WRL::ComPtr<ID3D12Fence> fence;
    THROW_IF_FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(fence.GetAddressOf())));
    THROW_IF_FAILED(queue->Signal(fence.Get(), kCompletionValue));

    // Auto-reset, initially non-signaled event; fail early if it could not be created.
    wil::unique_handle fenceEvent(CreateEvent(nullptr, FALSE, FALSE, nullptr));
    THROW_HR_IF(E_FAIL, fenceEvent.get() == nullptr);

    THROW_IF_FAILED(fence->SetEventOnCompletion(kCompletionValue, fenceEvent.get()));
    THROW_HR_IF(E_FAIL, WaitForSingleObject(fenceEvent.get(), INFINITE) != WAIT_OBJECT_0);
}

// Creates a DML-backed Ort::Value for a tensor described by `tensor_info` and
// repeatedly uploads CPU data into its D3D12 buffer (the stress loop that
// reproduces the reported leak).
//
//   tensor_info      - element type and shape of the tensor to create.
//   queue            - command queue used to execute the copies; the D3D12
//                      device is taken from it.
//   commandAllocator - allocator backing `commandList`; reset after each upload.
//   commandList      - command list used to record each copy. It is closed and
//                      reset once per iteration, so it must be in the recording
//                      state on entry. It is left recording on normal return.
//
// Returns the Ort::Value together with the handle that owns the DML GPU
// allocation the value refers to.
//
// Failures are raised through THROW_IF_FAILED / Ort::ThrowOnError.
std::pair<Ort::Value, UniqueNativePtr> CreateDmlValue(
    const Ort::ConstTensorTypeAndShapeInfo& tensor_info,
    ID3D12CommandQueue* queue,
    ID3D12CommandAllocator* commandAllocator, ID3D12GraphicsCommandList* commandList) {

  // Size of the CPU-side source data. Presumably 1x3x224x224 fp16 elements for
  // the mobilenetv2 model loaded in main - TODO confirm.
  constexpr size_t kSourceDataByteSize = 301056;
  // Number of upload round-trips performed to make the leak observable.
  constexpr int kUploadIterations = 1000000;

  auto& ortApi = Ort::GetApi();
  const OrtDmlApi* ortDmlApi = nullptr;
  Ort::ThrowOnError(ortApi.GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&ortDmlApi)));

  Microsoft::WRL::ComPtr<ID3D12Device> device;
  THROW_IF_FAILED(queue->GetDevice(IID_PPV_ARGS(&device)));

  auto shape = tensor_info.GetShape();
  auto resource = CreateD3D12Resource(device.Get(), tensor_info.GetElementType(), shape, D3D12_HEAP_TYPE_DEFAULT);

  const std::vector<std::byte> sourceData(kSourceDataByteSize);

  // The destination buffer never changes, so its size is queried once.
  const D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc();
  assert(resourceDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER);
  const size_t dataSizeInBytes = static_cast<size_t>(resourceDesc.Width);

  // Never read past the end of the CPU source buffer. The copy used to take
  // dataSizeInBytes unconditionally, which over-reads sourceData for any tensor
  // larger than kSourceDataByteSize.
  const size_t clampedDataByteSize = std::min(dataSizeInBytes, sourceData.size());

  // Transition recorded after each copy. The resource is created in the COMMON
  // state; StateBefore = COPY_DEST relies on the copy having moved it there.
  // NOTE(review): presumably D3D12 implicit state promotion/decay for buffers -
  // confirm this also holds with the NPU driver.
  D3D12_RESOURCE_BARRIER resourceBarrier = {};
  resourceBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
  resourceBarrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
  resourceBarrier.Transition.pResource = resource.Get();
  resourceBarrier.Transition.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
  resourceBarrier.Transition.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST;
  resourceBarrier.Transition.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS;

  for (int i = 0; i < kUploadIterations; i++) {
    // Create intermediate upload resource visible to both CPU and GPU.
    // A new one is deliberately created per iteration; this is the operation
    // under test.
    // NOTE(review): CreateD3D12ResourceOfByteSize raises the size to at least
    // DML_MINIMUM_BUFFER_TENSOR_ALIGNMENT, so for very small tensors the upload
    // buffer can be larger than `resource` - check that CopyResource accepts that.
    Microsoft::WRL::ComPtr<ID3D12Resource> uploadBuffer;
    CreateD3D12ResourceOfByteSize(device.Get(), dataSizeInBytes, uploadBuffer,
        D3D12_HEAP_TYPE_UPLOAD, D3D12_RESOURCE_STATE_GENERIC_READ, D3D12_RESOURCE_FLAG_NONE);

    // Copy CPU-side data to shared memory that is both CPU and GPU visible.
    std::byte* uploadBufferData = nullptr;
    THROW_IF_FAILED(uploadBuffer->Map(0, nullptr, reinterpret_cast<void**>(&uploadBufferData)));
    memcpy(uploadBufferData, sourceData.data(), clampedDataByteSize);
    uploadBuffer->Unmap(0, nullptr);

    // Issue deferred command to copy from the intermediate shared resource to the final GPU resource,
    // and then execute the commands.
    commandList->CopyResource(resource.Get(), uploadBuffer.Get());
    commandList->ResourceBarrier(1, &resourceBarrier);
    THROW_IF_FAILED(commandList->Close());

    ID3D12CommandList* commandLists[] = { commandList };
    queue->ExecuteCommandLists(ARRAYSIZE(commandLists), commandLists);

    // The upload buffer is released at the end of this iteration, so the GPU
    // must be finished with it before continuing.
    WaitForQueueToComplete(queue);

    THROW_IF_FAILED(commandAllocator->Reset());
    THROW_IF_FAILED(commandList->Reset(commandAllocator, nullptr));
  }

  void* dmlAllocatorResource = nullptr;
  Ort::ThrowOnError(ortDmlApi->CreateGPUAllocationFromD3DResource(resource.Get(), &dmlAllocatorResource));

  // Frees the DML allocation when the returned handle goes out of scope.
  // NOTE(review): this deleter can throw via Ort::ThrowOnError while running
  // from unique_ptr's destructor - consider whether that is acceptable.
  auto uniqueDmlAllocatorResource = UniqueNativePtr(dmlAllocatorResource, [](void* ptr) {
    auto& ortApi = Ort::GetApi();
    const OrtDmlApi* ortDmlApi = nullptr;
    Ort::ThrowOnError(ortApi.GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&ortDmlApi)));
    Ort::ThrowOnError(ortDmlApi->FreeGPUAllocation(ptr));
  });

  Ort::MemoryInfo memoryInfo("DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);

  // create the OrtValue as a tensor letting ort know that we own the data buffer
  OrtValue* value = nullptr;
  Ort::ThrowOnError(ortApi.CreateTensorWithDataAsOrtValue(
      memoryInfo,
      uniqueDmlAllocatorResource.get(),
      dataSizeInBytes,
      shape.data(),
      shape.size(),
      tensor_info.GetElementType(),
      &value));

  return { Ort::Value(value), std::move(uniqueDmlAllocatorResource) };
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant