Hi, I'm trying the DirectMLNpuInference sample. When I select the dGPU as the DML device, I can populate the input tensor into a D3D resource normally.
However, when I select the NPU as the DML device, memory leaks while populating the values, during the copy from the upload-heap GPU buffer to the default-heap GPU buffer.
Information:
CPU: Intel(R) Core(TM) Ultra 7 155U
OS: Windows 11, version 24H2 (OS build 26100)
NPU driver: 32.0.100.2714
GPU driver (Intel): 32.0.101.5768
DirectML: 1.15.1
Here's the code in main.cpp and TensorHelper.cpp that I edited in the sample to reproduce this issue. Thanks.
main.cpp
#include ...
void InitializeDirectML(ID3D12Device1** d3dDeviceOut, ID3D12CommandQueue** commandQueueOut, IDMLDevice** dmlDeviceOut,
                        ID3D12CommandAllocator** commandAllocatorOut, ID3D12GraphicsCommandList** commandListOut) {
    // Whether to skip adapters which support Graphics in order to target NPU for testing
    //bool forceComputeOnlyDevice = true;

    ComPtr<IDXCoreAdapterFactory> factory;
    HMODULE dxCoreModule = LoadLibraryW(L"DXCore.dll");
    if (dxCoreModule)
    {
        auto dxcoreCreateAdapterFactory = reinterpret_cast<HRESULT(WINAPI*)(REFIID, void**)>(
            GetProcAddress(dxCoreModule, "DXCoreCreateAdapterFactory")
        );
        if (dxcoreCreateAdapterFactory)
        {
            dxcoreCreateAdapterFactory(IID_PPV_ARGS(&factory));
        }
    }

    // Create the DXCore adapter
    ComPtr<IDXCoreAdapter> adapter;
    if (factory)
    {
        const GUID dxGUIDs[] = { DXCORE_ADAPTER_ATTRIBUTE_D3D12_GENERIC_ML };
        ComPtr<IDXCoreAdapterList> adapterList;
        THROW_IF_FAILED(factory->CreateAdapterList(ARRAYSIZE(dxGUIDs), dxGUIDs, IID_PPV_ARGS(&adapterList)));
        for (uint32_t i = 0, adapterCount = adapterList->GetAdapterCount(); i < adapterCount; i++)
        {
            // Adapter order on my machine:
            // i==0 iGPU
            // i==1 dGPU
            // i==2 NPU
            // i==3 CPU
            ComPtr<IDXCoreAdapter> nextGpuAdapter;
            THROW_IF_FAILED(adapterList->GetAdapter(static_cast<uint32_t>(i), IID_PPV_ARGS(&nextGpuAdapter)));
            if (nextGpuAdapter->IsAttributeSupported(DXCORE_HARDWARE_TYPE_ATTRIBUTE_NPU))
            {
                adapter = std::move(nextGpuAdapter);
                break;
            }
            /*
            // Alternative: select the dGPU by index instead of the NPU.
            if (i == 1) {
                adapter = std::move(nextGpuAdapter);
                break;
            }
            */
        }
    }

    // Create the D3D12 device
    ComPtr<ID3D12Device1> d3dDevice;
    if (adapter)
    {
        HMODULE d3d12Module = LoadLibraryW(L"d3d12.dll");
        if (d3d12Module)
        {
            auto d3d12CreateDevice = reinterpret_cast<HRESULT(WINAPI*)(IUnknown*, D3D_FEATURE_LEVEL, REFIID, void*)>(
                GetProcAddress(d3d12Module, "D3D12CreateDevice")
            );
            if (d3d12CreateDevice)
            {
                THROW_IF_FAILED(d3d12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_1_0_GENERIC, IID_PPV_ARGS(&d3dDevice)));
            }
        }
    }

    // Create the DML device and D3D12 command queue
    ComPtr<IDMLDevice> dmlDevice;
    ComPtr<ID3D12CommandQueue> commandQueue;
    if (d3dDevice)
    {
        D3D12_COMMAND_QUEUE_DESC queueDesc = {};
        queueDesc.Type = D3D12_COMMAND_LIST_TYPE_COMPUTE;
        THROW_IF_FAILED(d3dDevice->CreateCommandQueue(
            &queueDesc,
            IID_PPV_ARGS(commandQueue.ReleaseAndGetAddressOf())));

        HMODULE dmlModule = LoadLibraryW(L"DirectML.dll");
        if (dmlModule)
        {
            auto dmlCreateDevice = reinterpret_cast<HRESULT(WINAPI*)(ID3D12Device*, DML_CREATE_DEVICE_FLAGS, DML_FEATURE_LEVEL, REFIID, void*)>(
                GetProcAddress(dmlModule, "DMLCreateDevice1")
            );
            if (dmlCreateDevice)
            {
                THROW_IF_FAILED(dmlCreateDevice(d3dDevice.Get(), DML_CREATE_DEVICE_FLAG_NONE, DML_FEATURE_LEVEL_5_0, IID_PPV_ARGS(dmlDevice.ReleaseAndGetAddressOf())));
            }
        }
    }

    ComPtr<ID3D12CommandAllocator> commandAllocator;
    ComPtr<ID3D12GraphicsCommandList> commandList;
    THROW_IF_FAILED(d3dDevice->CreateCommandAllocator(
        D3D12_COMMAND_LIST_TYPE_COMPUTE,
        IID_PPV_ARGS(commandAllocator.ReleaseAndGetAddressOf())));
    THROW_IF_FAILED(d3dDevice->CreateCommandList(
        0,
        D3D12_COMMAND_LIST_TYPE_COMPUTE,
        commandAllocator.Get(),
        nullptr,
        IID_PPV_ARGS(commandList.ReleaseAndGetAddressOf())));

    d3dDevice.CopyTo(d3dDeviceOut);
    commandQueue.CopyTo(commandQueueOut);
    dmlDevice.CopyTo(dmlDeviceOut);
    commandAllocator.CopyTo(commandAllocatorOut);
    commandList.CopyTo(commandListOut);
}
int main()
{
    ComPtr<ID3D12Device1> d3dDevice;
    ComPtr<IDMLDevice> dmlDevice;
    ComPtr<ID3D12CommandQueue> commandQueue;
    ComPtr<ID3D12CommandAllocator> command_allocator_;
    ComPtr<ID3D12GraphicsCommandList> command_list_;
    InitializeDirectML(d3dDevice.GetAddressOf(), commandQueue.GetAddressOf(), dmlDevice.GetAddressOf(),
                       command_allocator_.GetAddressOf(), command_list_.GetAddressOf());

    // Add the DML execution provider to ORT using the DML device and D3D12 command queue created above.
    if (!dmlDevice)
    {
        printf("No NPU device found\n");
        return 1;
    }

    const OrtApi& ortApi = Ort::GetApi();
    static Ort::Env s_OrtEnv{ nullptr };
    s_OrtEnv = Ort::Env(Ort::ThreadingOptions{});
    s_OrtEnv.DisableTelemetryEvents();

    auto sessionOptions = Ort::SessionOptions{};
    sessionOptions.DisableMemPattern();
    sessionOptions.DisablePerSessionThreads();
    sessionOptions.SetExecutionMode(ExecutionMode::ORT_SEQUENTIAL);

    const OrtDmlApi* ortDmlApi = nullptr;
    Ort::ThrowOnError(ortApi.GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&ortDmlApi)));
    Ort::ThrowOnError(ortDmlApi->SessionOptionsAppendExecutionProvider_DML1(sessionOptions, dmlDevice.Get(), commandQueue.Get()));

    // Create the session
    auto session = Ort::Session(s_OrtEnv, L"mobilenetv2-7-fp16.onnx", sessionOptions);
    //const char* inputName = "input";
    //const char* outputName = "output";

    // Create the input tensor
    Ort::TypeInfo type_info = session.GetInputTypeInfo(0);
    auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
    auto input = CreateDmlValue(tensor_info, commandQueue.Get(), command_allocator_.Get(), command_list_.Get());
    return 0;
}
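For reference, this is roughly how the created value could then be bound and run, assuming the model's I/O names are "input" and "output" as the commented-out lines above suggest. This is a sketch only, not part of my repro; the leak happens inside CreateDmlValue, before any inference runs:

    Ort::IoBinding ioBinding{ session };
    ioBinding.BindInput("input", input.first);
    Ort::MemoryInfo dmlMemoryInfo("DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);
    ioBinding.BindOutput("output", dmlMemoryInfo);
    session.Run(Ort::RunOptions{}, ioBinding);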
TensorHelper.cpp
#include ...
inline void ThrowOnFailed(HRESULT hr) {
    if (FAILED(hr)) {
        // A bare `throw;` with no exception in flight calls std::terminate,
        // so throw a real exception instead.
        throw std::runtime_error("Operation returned a failing HRESULT");
    }
}
using UniqueNativePtr = std::unique_ptr<void, void (*)(void*)>;

size_t GetSizeFromType(ONNXTensorElementDataType type) {
#define CASE_FOR_TYPE(T) \
    case Ort::TypeToTensorType<T>::type: { \
        return sizeof(T); \
    }

    switch (type) {
        CASE_FOR_TYPE(Ort::Float16_t);
        CASE_FOR_TYPE(Ort::BFloat16_t);
        CASE_FOR_TYPE(float);
        CASE_FOR_TYPE(double);
        CASE_FOR_TYPE(int8_t);
        CASE_FOR_TYPE(int16_t);
        CASE_FOR_TYPE(int32_t);
        CASE_FOR_TYPE(int64_t);
        CASE_FOR_TYPE(uint8_t);
        CASE_FOR_TYPE(uint16_t);
        CASE_FOR_TYPE(uint32_t);
        CASE_FOR_TYPE(uint64_t);
        CASE_FOR_TYPE(bool);
#if !defined(DISABLE_FLOAT8_TYPES)
        CASE_FOR_TYPE(Ort::Float8E4M3FN_t);
        CASE_FOR_TYPE(Ort::Float8E4M3FNUZ_t);
        CASE_FOR_TYPE(Ort::Float8E5M2_t);
        CASE_FOR_TYPE(Ort::Float8E5M2FNUZ_t);
#endif
        default:
            // A bare `throw;` outside a catch block would call std::terminate.
            throw std::invalid_argument("Unsupported tensor element type");
    }
#undef CASE_FOR_TYPE
}
Microsoft::WRL::ComPtr<ID3D12Resource> CreateD3D12Resource(
    ID3D12Device* device,
    ONNXTensorElementDataType type,
    const std::vector<int64_t>& shape,
    D3D12_HEAP_TYPE heapType) {
    // Try to allocate the backing memory for the caller
    auto bufferSize =
        std::accumulate(
            std::begin(shape),
            std::end(shape),
            static_cast<int64_t>(1),
            std::multiplies<int64_t>());
    auto bufferByteSize = GetSizeFromType(type) * bufferSize;

    // DML needs the resources' sizes to be a multiple of 4 bytes
    if (bufferByteSize % 4 != 0) {
        bufferByteSize += 4 - (bufferByteSize % 4);
    }

    auto flags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS;
    if (heapType == D3D12_HEAP_TYPE_UPLOAD ||
        heapType == D3D12_HEAP_TYPE_READBACK) {
        flags = D3D12_RESOURCE_FLAG_NONE;
    }

    D3D12_HEAP_PROPERTIES heapProperties = {
        heapType, D3D12_CPU_PAGE_PROPERTY_UNKNOWN, D3D12_MEMORY_POOL_UNKNOWN, 0, 0};
    D3D12_RESOURCE_DESC resourceDesc = {
        D3D12_RESOURCE_DIMENSION_BUFFER,
        0,
        static_cast<uint64_t>(bufferByteSize),
        1,
        1,
        1,
        DXGI_FORMAT_UNKNOWN,
        {1, 0},
        D3D12_TEXTURE_LAYOUT_ROW_MAJOR,
        flags};

    Microsoft::WRL::ComPtr<ID3D12Resource> resource;
    ThrowOnFailed(device->CreateCommittedResource(
        &heapProperties,
        D3D12_HEAP_FLAG_NONE,
        &resourceDesc,
        D3D12_RESOURCE_STATE_COMMON,
        nullptr,
        __uuidof(ID3D12Resource),
        &resource));
    return resource;
}
void CreateD3D12ResourceOfByteSize(
    ID3D12Device* d3dDevice,
    size_t resourceByteSize,
    Microsoft::WRL::ComPtr<ID3D12Resource>& gpuResource,
    D3D12_HEAP_TYPE heapType = D3D12_HEAP_TYPE_DEFAULT,
    D3D12_RESOURCE_STATES resourceState = D3D12_RESOURCE_STATE_COMMON,
    D3D12_RESOURCE_FLAGS resourceFlags = D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS)
{
    resourceByteSize = std::max(resourceByteSize, size_t(DML_MINIMUM_BUFFER_TENSOR_ALIGNMENT));

    // DML needs the resources' sizes to be a multiple of 4 bytes
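    // ((x + 3) & ~3) rounds x up to the next multiple of 4 without a branch,
    // equivalent to the modulo-based rounding in CreateD3D12Resource above.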
    (resourceByteSize += 3) &= ~3;

    D3D12_HEAP_PROPERTIES heapProperties;
    heapProperties.Type = heapType; // Default to D3D12_HEAP_TYPE_DEFAULT.
    heapProperties.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
    heapProperties.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
    heapProperties.CreationNodeMask = 1;
    heapProperties.VisibleNodeMask = 1;

    D3D12_RESOURCE_DESC resourceDesc;
    resourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
    resourceDesc.Alignment = 0;
    resourceDesc.Width = static_cast<uint64_t>(resourceByteSize);
    resourceDesc.Height = 1;
    resourceDesc.DepthOrArraySize = 1;
    resourceDesc.MipLevels = 1;
    resourceDesc.Format = DXGI_FORMAT_UNKNOWN;
    resourceDesc.SampleDesc = { 1, 0 };
    resourceDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
    resourceDesc.Flags = resourceFlags;

    THROW_IF_FAILED(d3dDevice->CreateCommittedResource(
        &heapProperties,
        D3D12_HEAP_FLAG_NONE,
        &resourceDesc,
        resourceState, // Default to D3D12_RESOURCE_STATE_COMMON
        nullptr,
        __uuidof(ID3D12Resource),
        /*out*/ (void**)gpuResource.GetAddressOf()));
}
void WaitForQueueToComplete(ID3D12CommandQueue* queue)
{
    Microsoft::WRL::ComPtr<ID3D12Device> device = nullptr;
    ThrowOnFailed(queue->GetDevice(IID_PPV_ARGS(&device)));

    Microsoft::WRL::ComPtr<ID3D12Fence> fence;
    THROW_IF_FAILED(device->CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(fence.GetAddressOf())));
    THROW_IF_FAILED(queue->Signal(fence.Get(), 1));

    wil::unique_handle fenceEvent(CreateEvent(nullptr, FALSE, FALSE, nullptr));
    THROW_IF_FAILED(fence->SetEventOnCompletion(1, fenceEvent.get()));
    THROW_HR_IF(E_FAIL, WaitForSingleObject(fenceEvent.get(), INFINITE) != WAIT_OBJECT_0);
}
std::pair<Ort::Value, UniqueNativePtr> CreateDmlValue(
    const Ort::ConstTensorTypeAndShapeInfo& tensor_info,
    ID3D12CommandQueue* queue,
    ID3D12CommandAllocator* commandAllocator,
    ID3D12GraphicsCommandList* commandList) {
    auto& ortApi = Ort::GetApi();
    const OrtDmlApi* ortDmlApi;
    Ort::ThrowOnError(ortApi.GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&ortDmlApi)));

    Microsoft::WRL::ComPtr<ID3D12Device> device = nullptr;
    ThrowOnFailed(queue->GetDevice(IID_PPV_ARGS(&device)));

    auto shape = tensor_info.GetShape();
    auto resource = CreateD3D12Resource(device.Get(), tensor_info.GetElementType(), shape, D3D12_HEAP_TYPE_DEFAULT);

    // 1x3x224x224 fp16 input = 301056 bytes.
    std::vector<std::byte> sourceData(301056);

    // Repeat the upload many times to make the leak obvious.
    for (int i = 0; i < 1000000; i++) {
        // Get the size of the resource.
        Microsoft::WRL::ComPtr<ID3D12Device> d3d12Device;
        THROW_IF_FAILED(queue->GetDevice(IID_PPV_ARGS(&d3d12Device)));
        D3D12_RESOURCE_DESC resourceDesc = resource->GetDesc();
        assert(resourceDesc.Dimension == D3D12_RESOURCE_DIMENSION_BUFFER);
        const size_t dataSizeInBytes = static_cast<size_t>(resourceDesc.Width);

        // Create an intermediate upload resource visible to both CPU and GPU.
        Microsoft::WRL::ComPtr<ID3D12Resource> uploadBuffer;
        CreateD3D12ResourceOfByteSize(d3d12Device.Get(), dataSizeInBytes, uploadBuffer,
            D3D12_HEAP_TYPE_UPLOAD, D3D12_RESOURCE_STATE_GENERIC_READ, D3D12_RESOURCE_FLAG_NONE);

        // Copy CPU-side data to shared memory that is both CPU and GPU visible.
        // Clamp to the source size so a rounded-up resource width cannot read past sourceData.
        const size_t clampedDataByteSize = std::min(dataSizeInBytes, sourceData.size());
        std::byte* uploadBufferData = nullptr;
        THROW_IF_FAILED(uploadBuffer->Map(0, nullptr, reinterpret_cast<void**>(&uploadBufferData)));
        memcpy(uploadBufferData, sourceData.data(), clampedDataByteSize);
        uploadBuffer->Unmap(0, nullptr);

        D3D12_RESOURCE_BARRIER resourceBarrier;
        resourceBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
        resourceBarrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;

        D3D12_RESOURCE_TRANSITION_BARRIER transitionBarrier;
        transitionBarrier.pResource = resource.Get();
        transitionBarrier.Subresource = D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
        transitionBarrier.StateBefore = D3D12_RESOURCE_STATE_COPY_DEST;       // D3D12_RESOURCE_STATE_COMMON;
        transitionBarrier.StateAfter = D3D12_RESOURCE_STATE_UNORDERED_ACCESS; // D3D12_RESOURCE_STATE_COPY_DEST;
        resourceBarrier.Transition = transitionBarrier;

        // Issue deferred command to copy from the intermediate shared resource to the final GPU resource,
        // and then execute the commands.
        commandList->CopyResource(resource.Get(), uploadBuffer.Get());
        commandList->ResourceBarrier(1, &resourceBarrier);
        THROW_IF_FAILED(commandList->Close());
        ID3D12CommandList* commandLists[] = { commandList };
        queue->ExecuteCommandLists(ARRAYSIZE(commandLists), commandLists);
        WaitForQueueToComplete(queue);
        THROW_IF_FAILED(commandAllocator->Reset());
        THROW_IF_FAILED(commandList->Reset(commandAllocator, nullptr));
    }
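    // Wrap the D3D12 resource in a GPU allocation handle that the DML execution
    // provider understands; the deleter below releases it via FreeGPUAllocation.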
    void* dmlAllocatorResource;
    Ort::ThrowOnError(ortDmlApi->CreateGPUAllocationFromD3DResource(resource.Get(), &dmlAllocatorResource));
    auto uniqueDmlAllocatorResource = UniqueNativePtr(dmlAllocatorResource, [](void* ptr) {
        auto& ortApi = Ort::GetApi();
        const OrtDmlApi* ortDmlApi;
        Ort::ThrowOnError(ortApi.GetExecutionProviderApi("DML", ORT_API_VERSION, reinterpret_cast<const void**>(&ortDmlApi)));
        Ort::ThrowOnError(ortDmlApi->FreeGPUAllocation(ptr));
    });

    Ort::MemoryInfo memoryInfo("DML", OrtAllocatorType::OrtDeviceAllocator, 0, OrtMemType::OrtMemTypeDefault);

    // Create the OrtValue as a tensor, letting ORT know that we own the data buffer.
    OrtValue* value;
    Ort::ThrowOnError(ortApi.CreateTensorWithDataAsOrtValue(
        memoryInfo,
        uniqueDmlAllocatorResource.get(),
        static_cast<size_t>(resource->GetDesc().Width),
        shape.data(),
        shape.size(),
        tensor_info.GetElementType(),
        &value));
    return { Ort::Value(value), std::move(uniqueDmlAllocatorResource) };
}
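One simple way to watch the growth per iteration is to print process memory inside the loop. A minimal sketch (PrintProcessMemory is my own helper name, not part of the sample; link against psapi.lib):

#include <windows.h>
#include <psapi.h>
#include <cstdio>

void PrintProcessMemory(const char* label)
{
    PROCESS_MEMORY_COUNTERS_EX pmc = {};
    if (GetProcessMemoryInfo(GetCurrentProcess(),
                             reinterpret_cast<PROCESS_MEMORY_COUNTERS*>(&pmc),
                             static_cast<DWORD>(sizeof(pmc))))
    {
        // If the leak reproduces, private (committed) bytes keep growing
        // each iteration on the NPU device but stay flat on the dGPU.
        printf("%s: working set %zu KB, private %zu KB\n",
               label,
               static_cast<size_t>(pmc.WorkingSetSize) / 1024,
               static_cast<size_t>(pmc.PrivateUsage) / 1024);
    }
}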