From b9b7753ba5a0fa280c837a7c4f088f9c637ad575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20M=C3=BCller?= Date: Sun, 9 Jul 2023 11:34:03 +0200 Subject: [PATCH] Replace GLM with tcnn's vector math, bugfixes and extra bindings --- .gitmodules | 3 - CMakeLists.txt | 10 +- dependencies/glm | 1 - dependencies/tiny-cuda-nn | 2 +- .../adam_optimizer.h | 6 +- .../bounding_box.cuh | 36 +- .../neural-graphics-primitives/camera_path.h | 10 +- include/neural-graphics-primitives/common.h | 194 +--- .../common_device.cuh | 184 +--- .../neural-graphics-primitives/common_host.h | 170 ++++ .../discrete_distribution.h | 6 +- include/neural-graphics-primitives/dlss.h | 4 +- include/neural-graphics-primitives/envmap.cuh | 17 +- .../neural-graphics-primitives/json_binding.h | 137 +-- .../marching_cubes.h | 41 +- include/neural-graphics-primitives/nerf.h | 77 +- .../nerf_device.cuh | 617 +++++++++++ .../neural-graphics-primitives/nerf_loader.h | 54 +- .../neural-graphics-primitives/nerf_network.h | 166 +-- .../neural-graphics-primitives/openxr_hmd.h | 4 +- .../neural-graphics-primitives/random_val.cuh | 14 +- .../render_buffer.h | 12 +- include/neural-graphics-primitives/sdf.h | 10 +- .../neural-graphics-primitives/shared_queue.h | 4 +- .../takikawa_encoding.cuh | 118 +-- include/neural-graphics-primitives/testbed.h | 237 +++-- .../neural-graphics-primitives/thread_pool.h | 4 +- .../tinyexr_wrapper.h | 6 +- .../tinyobj_loader_wrapper.h | 4 +- .../trainable_buffer.cuh | 32 +- .../neural-graphics-primitives/triangle.cuh | 29 +- .../triangle_bvh.cuh | 8 +- .../triangle_octree.cuh | 18 +- scripts/run.py | 3 +- src/camera_path.cu | 16 +- src/common_device.cu | 247 ----- src/{common.cu => common_host.cu} | 86 +- src/dlss.cu | 14 +- src/main.cu | 5 +- src/marching_cubes.cu | 49 +- src/nerf_loader.cu | 31 +- src/openxr_hmd.cu | 8 +- src/optix/pathescape.cu | 7 +- src/optix/pathescape.h | 4 +- src/optix/program.h | 5 +- src/optix/raystab.cu | 6 +- src/optix/raystab.h | 4 +- src/optix/raytrace.cu | 5 +- src/optix/raytrace.h | 4 +- src/python_api.cu | 32 +- src/render_buffer.cu | 62 +- src/testbed.cu | 314 +++--- src/testbed_image.cu | 50 +- src/testbed_nerf.cu | 958 +++++------------- src/testbed_sdf.cu | 42 +- src/testbed_volume.cu | 119 ++- src/thread_pool.cpp | 4 +- src/tinyexr_wrapper.cu | 8 +- ..._wrapper.cpp => tinyobj_loader_wrapper.cu} | 7 +- src/triangle_bvh.cu | 17 +- 60 files changed, 2023 insertions(+), 2319 deletions(-) delete mode 160000 dependencies/glm create mode 100644 include/neural-graphics-primitives/common_host.h create mode 100644 include/neural-graphics-primitives/nerf_device.cuh delete mode 100644 src/common_device.cu rename src/{common.cu => common_host.cu} (76%) rename src/{tinyobj_loader_wrapper.cpp => tinyobj_loader_wrapper.cu} (95%) diff --git a/.gitmodules b/.gitmodules index 4ef3aef16..1c90e8436 100644 --- a/.gitmodules +++ b/.gitmodules @@ -28,6 +28,3 @@ [submodule "dependencies/OpenXR-SDK"] path = dependencies/OpenXR-SDK url = https://github.com/KhronosGroup/OpenXR-SDK.git -[submodule "dependencies/glm"] - path = dependencies/glm - url = https://github.com/g-truc/glm diff --git a/CMakeLists.txt b/CMakeLists.txt index d6ac83e9a..9fce957e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,7 +54,6 @@ if (MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP24") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") endif() @@ -76,7 +75,6 @@ if (MSVC) else() 
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=-Wno-float-conversion") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=-fno-strict-aliasing") - list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=-fms-extensions") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=-fPIC") endif() list(APPEND CUDA_NVCC_FLAGS "--extended-lambda") @@ -203,7 +201,6 @@ endif(NGP_BUILD_WITH_GUI) list(APPEND NGP_INCLUDE_DIRECTORIES "dependencies" "dependencies/filesystem" - "dependencies/glm" "dependencies/nanovdb" "dependencies/NaturalSort" "dependencies/tinylogger" @@ -261,8 +258,7 @@ endif() list(APPEND NGP_SOURCES ${GUI_SOURCES} src/camera_path.cu - src/common.cu - src/common_device.cu + src/common_host.cu src/marching_cubes.cu src/nerf_loader.cu src/render_buffer.cu @@ -273,7 +269,7 @@ list(APPEND NGP_SOURCES src/testbed_volume.cu src/thread_pool.cpp src/tinyexr_wrapper.cu - src/tinyobj_loader_wrapper.cpp + src/tinyobj_loader_wrapper.cu src/triangle_bvh.cu ) @@ -284,6 +280,8 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL ${CMAKE_BINARY_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${CMAKE_BINARY_DIR}) get_filename_component(CUDA_COMPILER_BIN "${CMAKE_CUDA_COMPILER}" DIRECTORY) +get_filename_component(CUDA_DIR "${CUDA_COMPILER_BIN}" DIRECTORY) +set(CUDA_INCLUDE "${CUDA_DIR}/include") if (NGP_OPTIX) add_library(optix_program OBJECT diff --git a/dependencies/glm b/dependencies/glm deleted file mode 160000 index efec5db08..000000000 --- a/dependencies/glm +++ /dev/null @@ -1 +0,0 @@ -Subproject commit efec5db081e3aad807d0731e172ac597f6a39447 diff --git a/dependencies/tiny-cuda-nn b/dependencies/tiny-cuda-nn index 8d2536b8b..28ca991f9 160000 --- a/dependencies/tiny-cuda-nn +++ b/dependencies/tiny-cuda-nn @@ -1 +1 @@ -Subproject commit 8d2536b8b324c998ff0ecec74e9d6a9c77bd45f3 +Subproject commit 28ca991f99b44d10387d73077c07ccfdd7f96275 diff --git a/include/neural-graphics-primitives/adam_optimizer.h b/include/neural-graphics-primitives/adam_optimizer.h index d62b83908..17cee1155 100644 --- a/include/neural-graphics-primitives/adam_optimizer.h +++ b/include/neural-graphics-primitives/adam_optimizer.h @@ -20,7 +20,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { class VarAdamOptimizer { public: @@ -241,7 +241,7 @@ class RotationAdamOptimizer { float actual_learning_rate = m_hparams.learning_rate * std::sqrt(1 - std::pow(m_hparams.beta2, m_state.iter)) / (1 - std::pow(m_hparams.beta1, m_state.iter)); m_state.first_moment = m_hparams.beta1 * m_state.first_moment + (1 - m_hparams.beta1) * gradient; m_state.second_moment = m_hparams.beta2 * m_state.second_moment + (1 - m_hparams.beta2) * gradient * gradient; - vec3 rot = actual_learning_rate * m_state.first_moment / (sqrt(m_state.second_moment) + vec3(m_hparams.epsilon)); + vec3 rot = actual_learning_rate * m_state.first_moment / (sqrt(m_state.second_moment) + m_hparams.epsilon); m_state.variable = rotvec(rotmat(-rot) * rotmat(variable())); } @@ -308,4 +308,4 @@ inline void from_json(const nlohmann::json& j, RotationAdamOptimizer& opt) { opt.from_json(j); } -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/bounding_box.cuh b/include/neural-graphics-primitives/bounding_box.cuh index d7a083e15..038bfb8e0 100644 --- a/include/neural-graphics-primitives/bounding_box.cuh +++ b/include/neural-graphics-primitives/bounding_box.cuh @@ -19,7 +19,7 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { template NGP_HOST_DEVICE inline void project(vec3 points[N_POINTS], const vec3& axis, float& min, float& max) { @@ -51,7 +51,7 @@ struct BoundingBox { enlarge(tri.c); } - 
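// A minimal scalar sketch of the bias-corrected Adam step that
// RotationAdamOptimizer above applies per rotation component: both moment
// estimates are updated as usual, and the two bias corrections are folded
// into the effective learning rate. All names below are illustrative.
#include <cmath>

struct AdamState { float m = 0.0f, v = 0.0f; int iter = 0; };

inline float adam_step(AdamState& s, float grad, float lr = 1e-3f, float beta1 = 0.9f, float beta2 = 0.99f, float eps = 1e-8f) {
	++s.iter;
	// lr * sqrt(1 - beta2^t) / (1 - beta1^t) is equivalent to dividing
	// m by (1 - beta1^t) and sqrt(v) by sqrt(1 - beta2^t).
	float lr_t = lr * std::sqrt(1.0f - std::pow(beta2, (float)s.iter)) / (1.0f - std::pow(beta1, (float)s.iter));
	s.m = beta1 * s.m + (1.0f - beta1) * grad;
	s.v = beta2 * s.v + (1.0f - beta2) * grad * grad;
	return lr_t * s.m / (std::sqrt(s.v) + eps); // step to subtract from the parameter
}
// The optimizer above then maps this step back onto the rotation manifold via
// rotvec(rotmat(-rot) * rotmat(variable())) rather than subtracting directly.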
BoundingBox(std::vector::iterator begin, std::vector::iterator end) { + NGP_HOST_DEVICE BoundingBox(Triangle* begin, Triangle* end) { min = max = begin->a; for (auto it = begin; it != end; ++it) { enlarge(*it); @@ -59,8 +59,8 @@ struct BoundingBox { } NGP_HOST_DEVICE void enlarge(const BoundingBox& other) { - min = glm::min(min, other.min); - max = glm::max(max, other.max); + min = tcnn::min(min, other.min); + max = tcnn::max(max, other.max); } NGP_HOST_DEVICE void enlarge(const Triangle& tri) { @@ -70,8 +70,8 @@ struct BoundingBox { } NGP_HOST_DEVICE void enlarge(const vec3& point) { - min = glm::min(min, point); - max = glm::max(max, point); + min = tcnn::min(min, point); + max = tcnn::max(max, point); } NGP_HOST_DEVICE void inflate(float amount) { @@ -93,8 +93,8 @@ struct BoundingBox { NGP_HOST_DEVICE BoundingBox intersection(const BoundingBox& other) const { BoundingBox result = *this; - result.min = glm::max(result.min, other.min); - result.max = glm::min(result.max, other.max); + result.min = tcnn::max(result.min, other.min); + result.max = tcnn::min(result.max, other.max); return result; } @@ -165,14 +165,14 @@ struct BoundingBox { float tmax = (max.x - pos.x) / dir.x; if (tmin > tmax) { - tcnn::host_device_swap(tmin, tmax); + host_device_swap(tmin, tmax); } float tymin = (min.y - pos.y) / dir.y; float tymax = (max.y - pos.y) / dir.y; if (tymin > tymax) { - tcnn::host_device_swap(tymin, tymax); + host_device_swap(tymin, tymax); } if (tmin > tymax || tymin > tmax) { @@ -191,7 +191,7 @@ struct BoundingBox { float tzmax = (max.z - pos.z) / dir.z; if (tzmin > tzmax) { - tcnn::host_device_swap(tzmin, tzmax); + host_device_swap(tzmin, tzmax); } if (tmin > tzmax || tzmin > tmax) { @@ -210,7 +210,7 @@ struct BoundingBox { } NGP_HOST_DEVICE bool is_empty() const { - return any(lessThan(max, min)); + return max.x < min.x || max.y < min.y || max.z < min.z; } NGP_HOST_DEVICE bool contains(const vec3& p) const { @@ -226,12 +226,12 @@ struct BoundingBox { } NGP_HOST_DEVICE float distance_sq(const vec3& p) const { - return length2(glm::max(glm::max(min - p, p - max), vec3(0.0f))); + return length2(tcnn::max(tcnn::max(min - p, p - max), vec3(0.0f))); } NGP_HOST_DEVICE float signed_distance(const vec3& p) const { vec3 q = abs(p - min) - diag(); - return length(glm::max(q, vec3(0.0f))) + std::min(compMax(q), 0.0f); + return length(tcnn::max(q, vec3(0.0f))) + std::min(tcnn::max(q), 0.0f); } NGP_HOST_DEVICE void get_vertices(vec3 v[8]) const { @@ -249,12 +249,4 @@ struct BoundingBox { vec3 max = vec3(-std::numeric_limits::infinity()); }; -inline std::ostream& operator<<(std::ostream& os, const ngp::BoundingBox& bb) { - os << "["; - os << "min=[" << bb.min.x << "," << bb.min.y << "," << bb.min.z << "], "; - os << "max=[" << bb.max.x << "," << bb.max.y << "," << bb.max.z << "]"; - os << "]"; - return os; } - -NGP_NAMESPACE_END diff --git a/include/neural-graphics-primitives/camera_path.h b/include/neural-graphics-primitives/camera_path.h index f379177a4..c3530e912 100644 --- a/include/neural-graphics-primitives/camera_path.h +++ b/include/neural-graphics-primitives/camera_path.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include @@ -28,7 +28,7 @@ struct ImDrawList; -NGP_NAMESPACE_BEGIN +namespace ngp { struct CameraKeyframe { quat R; @@ -40,7 +40,7 @@ struct CameraKeyframe { int glow_mode; float glow_y_cutoff; mat4x3 m() const { - auto rot = toMat3(normalize(quat(R))); + auto rot = to_mat3(normalize(quat(R))); return mat4x3(rot[0], rot[1], rot[2], T); } @@ -113,7 +113,7 @@ struct CameraPath { // 
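// A minimal sketch of using the slab test above to bound ray marching,
// assuming ray_intersect returns the entry/exit distances along the ray as a
// vec2 {tmin, tmax} (an empty interval, tmin > tmax, signals a miss).
BoundingBox aabb;
aabb.enlarge(vec3(0.0f));
aabb.enlarge(vec3(1.0f));
vec3 origin = {-1.0f, 0.5f, 0.5f}, dir = {1.0f, 0.0f, 0.0f};
vec2 t_range = aabb.ray_intersect(origin, dir); // expect roughly {1, 2} here
if (t_range.x <= t_range.y) {
	float t = max(t_range.x, 0.0f); // clamp in case the origin is inside the box
	// ... march samples in [t, t_range.y] ...
}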
add size to ensure no negative value is generated by modulo return keyframes[(i + size) % size]; } else { - return keyframes[tcnn::clamp(i, 0, (int)keyframes.size()-1)]; + return keyframes[clamp(i, 0, (int)keyframes.size()-1)]; } } CameraKeyframe eval_camera_path(float t) { @@ -142,5 +142,5 @@ void visualize_cube(ImDrawList* list, const mat4& world2proj, const vec3& a, con void visualize_nerf_camera(ImDrawList* list, const mat4& world2proj, const mat4x3& xform, float aspect, uint32_t col = 0x80ffffff, float thickness = 1.0f); #endif -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/common.h b/include/neural-graphics-primitives/common.h index 86dec3bbd..c6f503288 100644 --- a/include/neural-graphics-primitives/common.h +++ b/include/neural-graphics-primitives/common.h @@ -15,83 +15,29 @@ #pragma once - -#include - -#ifdef __NVCC__ -# ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ -# pragma nv_diag_suppress = unsigned_compare_with_zero -# pragma nv_diag_suppress 20011 -# pragma nv_diag_suppress 20014 -# else -# pragma diag_suppress = unsigned_compare_with_zero -# pragma diag_suppress 20011 -# pragma diag_suppress 20014 -# endif -#endif - -// For glm swizzles to work correctly, Microsoft extensions -// need to be enabled. This is done by the -fms-extensions -// flag (see CMakeLists.txt), and the following macro needs -// to be defined such that GLM is aware of this. -#ifndef _MSC_EXTENSIONS -#define _MSC_EXTENSIONS +#ifdef _WIN32 +# define NOMINMAX #endif -#define GLM_FORCE_SWIZZLE -#include -#include -#include -#include -#include -using namespace glm; +#include +using namespace tcnn; -#define NGP_NAMESPACE_BEGIN namespace ngp { -#define NGP_NAMESPACE_END } #if defined(__CUDA_ARCH__) - #if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__)) - #define NGP_PRAGMA_UNROLL _Pragma("unroll") - #define NGP_PRAGMA_NO_UNROLL _Pragma("unroll 1") - #else - #define NGP_PRAGMA_UNROLL #pragma unroll - #define NGP_PRAGMA_NO_UNROLL #pragma unroll 1 - #endif + #define NGP_PRAGMA_UNROLL _Pragma("unroll") + #define NGP_PRAGMA_NO_UNROLL _Pragma("unroll 1") #else #define NGP_PRAGMA_UNROLL #define NGP_PRAGMA_NO_UNROLL #endif -#include - -#include -#include - -#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) +#if defined(__CUDACC__) || (defined(__clang__) && defined(__CUDA__)) #define NGP_HOST_DEVICE __host__ __device__ #else #define NGP_HOST_DEVICE #endif -NGP_NAMESPACE_BEGIN - -namespace fs = filesystem; - -bool is_wsl(); - -fs::path get_executable_dir(); -fs::path get_root_dir(); - -#ifdef _WIN32 -std::string utf16_to_utf8(const std::wstring& utf16); -std::wstring utf8_to_utf16(const std::string& utf16); -std::wstring native_string(const fs::path& path); -#else -std::string native_string(const fs::path& path); -#endif - -bool ends_with(const std::string& str, const std::string& ending); -bool ends_with_case_insensitive(const std::string& str, const std::string& ending); +namespace ngp { enum class EMeshRenderMode : int { Off, @@ -191,10 +137,6 @@ enum class ETestbedMode : int { None, }; -ETestbedMode mode_from_scene(const std::string& scene); -ETestbedMode mode_from_string(const std::string& str); -std::string to_string(ETestbedMode); - enum class EMlpAlgorithm : int { MMA, FMA, @@ -234,7 +176,7 @@ struct Ray { }; struct TrainingXForm { - bool operator==(const TrainingXForm& other) const { + NGP_HOST_DEVICE bool operator==(const TrainingXForm& other) const { return start == other.start && end == other.end; } @@ -252,7 +194,7 @@ enum class ELensMode : int { }; static 
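// A minimal sketch of the two macros defined above: NGP_HOST_DEVICE marks a
// function callable from both host and device code, and NGP_PRAGMA_UNROLL
// requests loop unrolling only when compiling device code (it expands to
// nothing on the host). The function itself is illustrative.
template <uint32_t N>
NGP_HOST_DEVICE float dot_n(const float* a, const float* b) {
	float result = 0.0f;
	NGP_PRAGMA_UNROLL
	for (uint32_t i = 0; i < N; ++i) {
		result += a[i] * b[i];
	}
	return result;
}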
constexpr const char* LensModeStr = "Perspective\0OpenCV\0F-Theta\0LatLong\0OpenCV Fisheye\0Equirectangular\0\0"; -inline bool supports_dlss(ELensMode mode) { +inline NGP_HOST_DEVICE bool supports_dlss(ELensMode mode) { return mode == ELensMode::Perspective || mode == ELensMode::OpenCV || mode == ELensMode::OpenCVFisheye; } @@ -261,10 +203,6 @@ struct Lens { float params[7] = {}; }; -inline NGP_HOST_DEVICE float sign(float x) { - return copysignf(1.0, x); -} - inline NGP_HOST_DEVICE uint32_t binary_search(float val, const float* data, uint32_t length) { if (length == 0) { return 0; @@ -287,115 +225,41 @@ inline NGP_HOST_DEVICE uint32_t binary_search(float val, const float* data, uint } } - return std::min(first, length-1); -} - -inline std::string replace_all(std::string str, const std::string& a, const std::string& b) { - std::string::size_type n = 0; - while ((n = str.find(a, n)) != std::string::npos) { - str.replace(n, a.length(), b); - n += b.length(); - } - return str; -} - -template -std::string join(const T& components, const std::string& delim) { - std::ostringstream s; - for (const auto& component : components) { - if (&components[0] != &component) { - s << delim; - } - s << component; - } - - return s.str(); + return min(first, length-1); } -enum class EEmaType { - Time, - Step, -}; - -class Ema { -public: - Ema(EEmaType type, float half_life) - : m_type{type}, m_decay{std::pow(0.5f, 1.0f / half_life)}, m_creation_time{std::chrono::steady_clock::now()} {} - - int64_t current_progress() { - if (m_type == EEmaType::Time) { - auto now = std::chrono::steady_clock::now(); - return std::chrono::duration_cast(now - m_creation_time).count(); - } else { - return m_last_progress + 1; - } - } - - void update(float val) { - int64_t cur = current_progress(); - int64_t elapsed = cur - m_last_progress; - m_last_progress = cur; - - float decay = std::pow(m_decay, elapsed); - m_val = val; - m_ema_val = decay * m_ema_val + (1.0f - decay) * val; - } - - void set(float val) { - m_last_progress = current_progress(); - m_val = m_ema_val = val; - } - - float val() const { - return m_val; - } - - float ema_val() const { - return m_ema_val; - } - -private: - float m_val = 0.0f; - float m_ema_val = 0.0f; - EEmaType m_type; - float m_decay; - - int64_t m_last_progress = 0; - std::chrono::time_point m_creation_time; -}; - template struct Buffer2DView { T* data = nullptr; - ivec2 resolution = ivec2(0); + ivec2 resolution = 0; // Lookup via integer pixel position (no bounds checking) - NGP_HOST_DEVICE T at(const ivec2& xy) const { - return data[xy.x + xy.y * resolution.x]; + NGP_HOST_DEVICE T at(const ivec2& px) const { + return data[px.x + px.y * resolution.x]; } // Lookup via UV coordinates in [0,1]^2 NGP_HOST_DEVICE T at(const vec2& uv) const { - ivec2 xy = clamp(ivec2(vec2(resolution) * uv), ivec2(0), resolution - ivec2(1)); - return at(xy); + ivec2 px = clamp(ivec2(vec2(resolution) * uv), 0, resolution - 1); + return at(px); } // Lookup via UV coordinates in [0,1]^2 and LERP the nearest texels NGP_HOST_DEVICE T at_lerp(const vec2& uv) const { - const vec2 xy_float = vec2(resolution) * uv; - const ivec2 xy = ivec2(xy_float); + const vec2 px_float = vec2(resolution) * uv; + const ivec2 px = ivec2(px_float); - const vec2 weight = xy_float - vec2(xy); + const vec2 weight = px_float - vec2(px); auto read_val = [&](ivec2 pos) { - return at(clamp(pos, ivec2(0), resolution - ivec2(1))); + return at(clamp(pos, 0, resolution - 1)); }; return ( - (1 - weight.x) * (1 - weight.y) * read_val({xy.x, xy.y}) + - 
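// A minimal sketch of binary_search above: it returns the first index whose
// value is >= val (a lower bound over sorted data), clamped to length-1 so a
// val beyond the last entry still yields a valid index. Handy for inverting a
// CDF when drawing discrete samples:
float cdf[4] = {0.1f, 0.4f, 0.8f, 1.0f};
uint32_t idx = binary_search(0.5f, cdf, 4); // -> 2: cdf[2] = 0.8 is the first entry >= 0.5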
(weight.x) * (1 - weight.y) * read_val({xy.x+1, xy.y}) + - (1 - weight.x) * (weight.y) * read_val({xy.x, xy.y+1}) + - (weight.x) * (weight.y) * read_val({xy.x+1, xy.y+1}) + (1 - weight.x) * (1 - weight.y) * read_val({px.x, px.y}) + + (weight.x) * (1 - weight.y) * read_val({px.x+1, px.y}) + + (1 - weight.x) * (weight.y) * read_val({px.x, px.y+1}) + + (weight.x) * (weight.y) * read_val({px.x+1, px.y+1}) ); } @@ -404,12 +268,4 @@ struct Buffer2DView { } }; -uint8_t* load_stbi(const fs::path& path, int* width, int* height, int* comp, int req_comp); -float* load_stbi_float(const fs::path& path, int* width, int* height, int* comp, int req_comp); -uint16_t* load_stbi_16(const fs::path& path, int* width, int* height, int* comp, int req_comp); -bool is_hdr_stbi(const fs::path& path); -int write_stbi(const fs::path& path, int width, int height, int comp, const uint8_t* pixels, int quality = 100); - -FILE* native_fopen(const fs::path& path, const char* mode); - -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/common_device.cuh b/include/neural-graphics-primitives/common_device.cuh index d70d4fe83..dfcdbb791 100644 --- a/include/neural-graphics-primitives/common_device.cuh +++ b/include/neural-graphics-primitives/common_device.cuh @@ -19,11 +19,10 @@ #include #include -#include -NGP_NAMESPACE_BEGIN +#include -using precision_t = tcnn::network_precision_t; +namespace ngp { // The maximum depth that can be produced when rendering a frame. @@ -32,51 +31,11 @@ using precision_t = tcnn::network_precision_t; // even when rendering the infinitely distant horizon. inline constexpr __device__ float MAX_DEPTH() { return 16384.0f; } -template -class Buffer2D { -public: - Buffer2D() = default; - Buffer2D(const ivec2& resolution) { - resize(resolution); - } - - T* data() const { - return m_data.data(); - } - - size_t bytes() const { - return m_data.bytes(); - } - - void resize(const ivec2& resolution) { - m_data.resize(compMul(resolution)); - m_resolution = resolution; - } - - const ivec2& resolution() const { - return m_resolution; - } - - Buffer2DView view() const { - // Row major for now. - return {data(), m_resolution}; - } - - Buffer2DView const_view() const { - // Row major for now. 
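// A minimal usage sketch of Buffer2DView above: a non-owning view over a
// row-major buffer with nearest and bilinear lookups. Note that at_lerp
// places texel origins at integer coordinates, so uv = (0.25, 0.25) on a 2x2
// image blends all four texels equally. The buffer below is illustrative.
float pixels[4] = {0.0f, 1.0f, 2.0f, 3.0f}; // 2x2 image, row-major
Buffer2DView<float> view{pixels, ivec2(2, 2)};
float nearest = view.at(vec2(0.75f, 0.75f));      // texel (1,1) -> 3.0
float blended = view.at_lerp(vec2(0.25f, 0.25f)); // 0.25 * (0+1+2+3) = 1.5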
- return {data(), m_resolution}; - } - -private: - tcnn::GPUMemory m_data; - ivec2 m_resolution; -}; - inline NGP_HOST_DEVICE float srgb_to_linear(float srgb) { if (srgb <= 0.04045f) { return srgb / 12.92f; } else { - return std::pow((srgb + 0.055f) / 1.055f, 2.4f); + return pow((srgb + 0.055f) / 1.055f, 2.4f); } } @@ -88,7 +47,7 @@ inline NGP_HOST_DEVICE float srgb_to_linear_derivative(float srgb) { if (srgb <= 0.04045f) { return 1.0f / 12.92f; } else { - return 2.4f / 1.055f * std::pow((srgb + 0.055f) / 1.055f, 1.4f); + return 2.4f / 1.055f * pow((srgb + 0.055f) / 1.055f, 1.4f); } } @@ -100,7 +59,7 @@ inline NGP_HOST_DEVICE float linear_to_srgb(float linear) { if (linear < 0.0031308f) { return 12.92f * linear; } else { - return 1.055f * std::pow(linear, 0.41666f) - 0.055f; + return 1.055f * pow(linear, 0.41666f) - 0.055f; } } @@ -112,7 +71,7 @@ inline NGP_HOST_DEVICE float linear_to_srgb_derivative(float linear) { if (linear < 0.0031308f) { return 12.92f; } else { - return 1.055f * 0.41666f * std::pow(linear, 0.41666f - 1.0f); + return 1.055f * 0.41666f * pow(linear, 0.41666f - 1.0f); } } @@ -130,8 +89,8 @@ __device__ void deposit_image_gradient(const vec2& value, T* __restrict__ gradie constexpr uint32_t N_DIMS = 2; auto deposit_val = [&](const vec2& value, T weight, ivec2 pos) { - pos.x = std::max(std::min(pos.x, resolution.x-1), 0); - pos.y = std::max(std::min(pos.y, resolution.y-1), 0); + pos.x = max(min(pos.x, resolution.x-1), 0); + pos.y = max(min(pos.y, resolution.y-1), 0); #if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above if (std::is_same::value) { @@ -157,7 +116,7 @@ __device__ void deposit_image_gradient(const vec2& value, T* __restrict__ gradie struct FoveationPiecewiseQuadratic { FoveationPiecewiseQuadratic() = default; - FoveationPiecewiseQuadratic(float center_pixel_steepness, float center_inverse_piecewise_y, float center_radius) { + NGP_HOST_DEVICE FoveationPiecewiseQuadratic(float center_pixel_steepness, float center_inverse_piecewise_y, float center_radius) { float center_inverse_radius = center_radius * center_pixel_steepness; float left_inverse_piecewise_switch = center_inverse_piecewise_y - center_inverse_radius; float right_inverse_piecewise_switch = center_inverse_piecewise_y + center_inverse_radius; @@ -232,7 +191,7 @@ struct FoveationPiecewiseQuadratic { float inv_switch_left = 0.0f, inv_switch_right = 1.0f; NGP_HOST_DEVICE float warp(float x) const { - x = tcnn::clamp(x, 0.0f, 1.0f); + x = clamp(x, 0.0f, 1.0f); if (x < switch_left) { return al * x * x + bl * x + cl; } else if (x > switch_right) { @@ -243,18 +202,18 @@ struct FoveationPiecewiseQuadratic { } NGP_HOST_DEVICE float unwarp(float y) const { - y = tcnn::clamp(y, 0.0f, 1.0f); + y = clamp(y, 0.0f, 1.0f); if (y < inv_switch_left) { - return (std::sqrt(-4 * al * cl + 4 * al * y + bl * bl) - bl) / (2 * al); + return (sqrt(-4 * al * cl + 4 * al * y + bl * bl) - bl) / (2 * al); } else if (y > inv_switch_right) { - return (std::sqrt(-4 * ar * cr + 4 * ar * y + br * br) - br) / (2 * ar); + return (sqrt(-4 * ar * cr + 4 * ar * y + br * br) - br) / (2 * ar); } else { return (y - bm) / am; } } NGP_HOST_DEVICE float density(float x) const { - x = tcnn::clamp(x, 0.0f, 1.0f); + x = clamp(x, 0.0f, 1.0f); if (x < switch_left) { return 2 * al * x + bl; } else if (x > switch_right) { @@ -268,7 +227,7 @@ struct FoveationPiecewiseQuadratic { struct Foveation { Foveation() = default; - Foveation(const vec2& center_pixel_steepness, const vec2& center_inverse_piecewise_y, 
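// A quick numeric sanity check for the sRGB helpers above: the two transforms
// are near-inverses (the 0.41666 exponent approximates 1/2.4), and each
// *_derivative function closely matches a central finite difference. The
// sample point and step size are illustrative.
float x = 0.5f;
float roundtrip = srgb_to_linear(linear_to_srgb(x)); // ~0.5
float h = 1e-3f;
float fd = (srgb_to_linear(x + h) - srgb_to_linear(x - h)) / (2.0f * h);
float analytic = srgb_to_linear_derivative(x); // agrees closely with fd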
const vec2& center_radius) + NGP_HOST_DEVICE Foveation(const vec2& center_pixel_steepness, const vec2& center_inverse_piecewise_y, const vec2& center_radius) : warp_x{center_pixel_steepness.x, center_inverse_piecewise_y.x, center_radius.x}, warp_y{center_pixel_steepness.y, center_inverse_piecewise_y.y, center_radius.y} {} FoveationPiecewiseQuadratic warp_x, warp_y; @@ -309,10 +268,10 @@ NGP_HOST_DEVICE inline void opencv_fisheye_lens_distortion_delta(const T* extra_ const T k3 = extra_params[2]; const T k4 = extra_params[3]; - const T r = std::sqrt(u * u + v * v); + const T r = sqrt(u * u + v * v); - if (r > T(std::numeric_limits::epsilon())) { - const T theta = std::atan(r); + if (r > (T)std::numeric_limits::epsilon()) { + const T theta = atan(r); const T theta2 = theta * theta; const T theta4 = theta2 * theta2; const T theta6 = theta4 * theta2; @@ -337,8 +296,8 @@ NGP_HOST_DEVICE inline void iterative_lens_undistortion(const T* params, T* u, T const float kRelStepSize = 1e-6f; mat2 J; - const vec2 x0(*u, *v); - vec2 x(*u, *v); + const vec2 x0{*u, *v}; + vec2 x{*u, *v}; vec2 dx; vec2 dx_0b; vec2 dx_0f; @@ -346,8 +305,8 @@ NGP_HOST_DEVICE inline void iterative_lens_undistortion(const T* params, T* u, T vec2 dx_1f; for (uint32_t i = 0; i < kNumIterations; ++i) { - const float step0 = std::max(std::numeric_limits::epsilon(), std::abs(kRelStepSize * x[0])); - const float step1 = std::max(std::numeric_limits::epsilon(), std::abs(kRelStepSize * x[1])); + const float step0 = max(std::numeric_limits::epsilon(), abs(kRelStepSize * x[0])); + const float step1 = max(std::numeric_limits::epsilon(), abs(kRelStepSize * x[1])); distortion_fun(params, x[0], x[1], &dx[0], &dx[1]); distortion_fun(params, x[0] - step0, x[1], &dx_0b[0], &dx_0b[1]); distortion_fun(params, x[0] + step0, x[1], &dx_0f[0], &dx_0f[1]); @@ -402,7 +361,7 @@ inline NGP_HOST_DEVICE mat4x3 get_xform_given_rolling_shutter(const TrainingXFor float pixel_t = rolling_shutter.x + rolling_shutter.y * uv.x + rolling_shutter.z * uv.y + rolling_shutter.w * motionblur_time; vec3 pos = training_xform.start[3] + (training_xform.end[3] - training_xform.start[3]) * pixel_t; - mat3 rot = toMat3(normalize(slerp(fquat(mat3(training_xform.start)), fquat(mat3(training_xform.end)), pixel_t))); + mat3 rot = to_mat3(normalize(slerp(quat(mat3(training_xform.start)), quat(mat3(training_xform.end)), pixel_t))); return mat4x3(rot[0], rot[1], rot[2], pos); } @@ -433,7 +392,7 @@ inline NGP_HOST_DEVICE vec3 latlong_to_dir(const vec2& uv) { inline NGP_HOST_DEVICE vec3 equirectangular_to_dir(const vec2& uv) { float ct = (uv.y - 0.5f) * 2.0f; - float st = std::sqrt(std::max(1.0f - ct * ct, 0.0f)); + float st = sqrt(max(1.0f - ct * ct, 0.0f)); float phi = (uv.x - 0.5f) * PI() * 2.0f; float sp, cp; sincosf(phi, &sp, &cp); @@ -489,7 +448,7 @@ inline NGP_HOST_DEVICE Ray uv_to_ray( } if (distortion) { - dir.xy += distortion.at_lerp(warped_uv); + dir.xy() += distortion.at_lerp(warped_uv); } vec3 head_pos = {parallax_shift.x, parallax_shift.y, 0.f}; @@ -500,7 +459,7 @@ inline NGP_HOST_DEVICE Ray uv_to_ray( if (aperture_size != 0.0f) { vec3 lookat = origin + dir * focus_z; auto px = ivec2(uv * vec2(resolution)); - vec2 blur = aperture_size * square2disk_shirley(ld_random_val_2d(spp, px.x * 19349663 + px.y * 96925573) * 2.0f - vec2(1.0f)); + vec2 blur = aperture_size * square2disk_shirley(ld_random_val_2d(spp, px.x * 19349663 + px.y * 96925573) * 2.0f - 1.0f); origin += mat2x3(camera_matrix) * blur; dir = (lookat - origin) / focus_z; } @@ -576,7 +535,7 @@ inline 
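// A minimal 1D sketch of the scheme iterative_lens_undistortion implements
// above: Newton iterations on f(x) = x + distortion(x) - observed, with the
// Jacobian estimated by central finite differences. The quadratic distortion
// model and constants here are illustrative, not the OpenCV one.
#include <cmath>

inline float undistort_1d(float observed, float k = 0.1f, uint32_t n_iters = 100) {
	auto distortion = [k](float x) { return k * x * x; };
	float x = observed; // initial guess: the distorted coordinate itself
	for (uint32_t i = 0; i < n_iters; ++i) {
		float step = std::fmax(1e-6f, std::fabs(1e-6f * x));
		float residual = x + distortion(x) - observed;
		float dfdx = 1.0f + (distortion(x + step) - distortion(x - step)) / (2.0f * step);
		x -= residual / dfdx; // Newton update
	}
	return x; // x + distortion(x) ~= observed
}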
NGP_HOST_DEVICE vec2 pos_to_uv( dir.x += du; dir.y += dv; - vec2 uv = dir.xy * focal_length / vec2(resolution) + screen_center; + vec2 uv = dir.xy() * focal_length / vec2(resolution) + screen_center; return foveation.unwarp(uv); } @@ -648,11 +607,11 @@ inline NGP_HOST_DEVICE vec2 motion_vector( // and VR reprojection. inline NGP_HOST_DEVICE float to_ndc_depth(float z, float n, float f) { // View depth outside of the view frustum leads to output outside of [0, 1] - z = tcnn::clamp(z, n, f); + z = clamp(z, n, f); float scale = n / (n - f); float bias = -f * scale; - return tcnn::clamp((z * scale + bias) / z, 0.0f, 1.0f); + return clamp((z * scale + bias) / z, 0.0f, 1.0f); } inline NGP_HOST_DEVICE float fov_to_focal_length(int resolution, float degrees) { @@ -695,54 +654,13 @@ inline NGP_HOST_DEVICE vec2 to_vec2(const float2& x) { return {x.x, x.y}; } -inline NGP_HOST_DEVICE vec3 rotvec(const mat3& mat) { - quat tmp = mat; - return axis(tmp) * angle(tmp); +inline NGP_HOST_DEVICE mat4x3 camera_log_lerp(const mat4x3& a, const mat4x3& b, float t) { + return mat_exp(mat_log(mat4(b) * inverse(mat4(a))) * t) * mat4(a); } -inline NGP_HOST_DEVICE mat3 rotmat(float angle, const vec3& axis) { - float s, c; - sincosf(angle, &s, &c); - float oc = 1.0f - c; - - return mat3( - oc * axis.x * axis.x + c, oc * axis.x * axis.y + axis.z * s, oc * axis.z * axis.x - axis.y * s, - oc * axis.x * axis.y - axis.z * s, oc * axis.y * axis.y + c, oc * axis.y * axis.z + axis.x * s, - oc * axis.z * axis.x + axis.y * s, oc * axis.y * axis.z - axis.x * s, oc * axis.z * axis.z + c - ); -} - -inline NGP_HOST_DEVICE mat3 rotmat(const vec3& vec) { - float angle = length(vec); - if (angle == 0.0f) { - return mat3(1.0f); - } - - return rotmat(angle, vec / angle); -} - -inline NGP_HOST_DEVICE mat3 slerp(const mat3& a, const mat3& b, float t) { - return toMat3(slerp(quat(a), quat(b), t)); -} - -inline NGP_HOST_DEVICE float norm(const mat4x3& mat) { - return sqrt(length2(mat[0]) + length2(mat[1]) + length2(mat[2]) + length2(mat[3])); -} - -inline NGP_HOST_DEVICE bool isfinite(float v) { - return std::isfinite(v); -} - -inline NGP_HOST_DEVICE bvec2 isfinite(const vec2& v) { - return bvec2(std::isfinite(v.x), std::isfinite(v.y)); -} - -inline NGP_HOST_DEVICE bvec3 isfinite(const vec3& v) { - return bvec3(std::isfinite(v.x), std::isfinite(v.y), std::isfinite(v.z)); -} - -inline NGP_HOST_DEVICE bvec4 isfinite(const vec4& v) { - return bvec4(std::isfinite(v.x), std::isfinite(v.y), std::isfinite(v.z), std::isfinite(v.w)); +inline NGP_HOST_DEVICE mat4x3 camera_slerp(const mat4x3& a, const mat4x3& b, float t) { + mat3 rot = slerp(mat3(a), mat3(b), t); + return {rot[0], rot[1], rot[2], mix(a[3], b[3], t)}; } inline NGP_HOST_DEVICE void apply_quilting(uint32_t* x, uint32_t* y, const ivec2& resolution, vec3& parallax_shift, const ivec2& quilting_dims) { @@ -784,7 +702,7 @@ __global__ void from_rgba32(const uint64_t num_pixels, const uint8_t* __restrict alpha = 0.f; } - tcnn::vector_t rgba_out; + tvec rgba_out; rgba_out[0] = (T)(srgb_to_linear(rgba[0] * (1.0f/255.0f)) * alpha); rgba_out[1] = (T)(srgb_to_linear(rgba[1] * (1.0f/255.0f)) * alpha); rgba_out[2] = (T)(srgb_to_linear(rgba[2] * (1.0f/255.0f)) * alpha); @@ -794,10 +712,9 @@ __global__ void from_rgba32(const uint64_t num_pixels, const uint8_t* __restrict rgba_out[0] = rgba_out[1] = rgba_out[2] = rgba_out[3] = (T)-1.0f; } - *((tcnn::vector_t*)&out[i*4]) = rgba_out; + *((tvec*)&out[i*4]) = rgba_out; } - // Foley & van Dam p593 / http://en.wikipedia.org/wiki/HSL_and_HSV inline 
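// A minimal sketch of the two camera interpolators above, for two
// camera-to-world poses a and b (e.g. neighboring keyframes): camera_slerp
// blends rotation spherically and translation linearly, whereas
// camera_log_lerp interpolates the whole transform in matrix-log space, which
// also blends any scale consistently. Both reproduce the endpoints at t = 0, 1.
inline mat4x3 halfway_pose(const mat4x3& a, const mat4x3& b) {
	return camera_slerp(a, b, 0.5f);
}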
NGP_HOST_DEVICE vec3 hsv_to_rgb(const vec3& hsv) { float h = hsv.x, s = hsv.y, v = hsv.z; @@ -839,24 +756,24 @@ enum class EDepthDataType { }; inline NGP_HOST_DEVICE ivec2 image_pos(const vec2& pos, const ivec2& resolution) { - return clamp(ivec2(pos * vec2(resolution)), ivec2(0), resolution - ivec2(1)); + return clamp(ivec2(pos * vec2(resolution)), 0, resolution - 1); } -inline NGP_HOST_DEVICE uint64_t pixel_idx(const ivec2& pos, const ivec2& resolution, uint32_t img) { - return pos.x + pos.y * resolution.x + img * (uint64_t)resolution.x * resolution.y; +inline NGP_HOST_DEVICE uint64_t pixel_idx(const ivec2& px, const ivec2& resolution, uint32_t img) { + return px.x + px.y * resolution.x + img * (uint64_t)resolution.x * resolution.y; } -inline NGP_HOST_DEVICE uint64_t pixel_idx(const vec2& xy, const ivec2& resolution, uint32_t img) { - return pixel_idx(image_pos(xy, resolution), resolution, img); +inline NGP_HOST_DEVICE uint64_t pixel_idx(const vec2& uv, const ivec2& resolution, uint32_t img) { + return pixel_idx(image_pos(uv, resolution), resolution, img); } // inline NGP_HOST_DEVICE vec3 composit_and_lerp(vec2 pos, const ivec2& resolution, uint32_t img, const __half* training_images, const vec3& background_color, const vec3& exposure_scale = vec3(1.0f)) { -// pos = (pos.cwiseProduct(vec2(resolution)) - vec2(0.5f)).cwiseMax(0.0f).cwiseMin(vec2(resolution) - vec2(1.0f + 1e-4f)); +// pos = (pos.cwiseProduct(vec2(resolution)) - 0.5f).cwiseMax(0.0f).cwiseMin(vec2(resolution) - (1.0f + 1e-4f)); // const ivec2 pos_int = pos.cast(); // const vec2 weight = pos - pos_int.cast(); -// const ivec2 idx = pos_int.cwiseMin(resolution - ivec2(2)).cwiseMax(0); +// const ivec2 idx = pos_int.cwiseMin(resolution - 2).cwiseMax(0); // auto read_val = [&](const ivec2& p) { // __half val[4]; @@ -905,7 +822,7 @@ inline NGP_HOST_DEVICE vec4 read_rgba(ivec2 px, const ivec2& resolution, const v case EImageDataType::Half: { __half val[4]; *(uint64_t*)&val[0] = ((uint64_t*)pixels)[pixel_idx(px, resolution, img)]; - return vec4{val[0], val[1], val[2], val[3]}; + return vec4{(float)val[0], (float)val[1], (float)val[2], (float)val[3]}; } case EImageDataType::Float: return ((vec4*)pixels)[pixel_idx(px, resolution, img)]; @@ -924,10 +841,13 @@ inline NGP_HOST_DEVICE float read_depth(vec2 pos, const ivec2& resolution, const return read_val(image_pos(pos, resolution)); } -mat4x3 camera_log_lerp(const mat4x3& begin, const mat4x3& end, float t); -mat4x3 camera_slerp(const mat4x3& begin, const mat4x3& end, float t); +inline __device__ int float_to_ordered_int(float f) { + int i = __float_as_int(f); + return (i >= 0 ) ? i : i ^ 0x7FFFFFFF; +} -tcnn::GPUMemory load_exr_gpu(const fs::path& path, int* width, int* height); -tcnn::GPUMemory load_stbi_gpu(const fs::path& path, int* width, int* height); +inline __device__ float ordered_int_to_float(int i) { + return __int_as_float(i >= 0 ? i : i ^ 0x7FFFFFFF); +} -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/common_host.h b/include/neural-graphics-primitives/common_host.h new file mode 100644 index 000000000..8769a6beb --- /dev/null +++ b/include/neural-graphics-primitives/common_host.h @@ -0,0 +1,170 @@ +/* +* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +* +* NVIDIA CORPORATION and its licensors retain all intellectual property +* and proprietary rights in and to this software, related documentation +* and any modifications thereto. 
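// A minimal sketch of what the ordered-int mapping above enables: CUDA has no
// atomicMin/atomicMax overloads for float, but since the mapping is monotonic,
// plain integer atomics implement a correct float min-reduction. The kernel
// and buffer names are illustrative.
__global__ void min_depth_kernel(const float* __restrict__ depths, uint32_t n, int* __restrict__ result) {
	uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n) {
		atomicMin(result, float_to_ordered_int(depths[i]));
	}
}
// Initialize *result to float_to_ordered_int(INFINITY) beforehand and decode
// the final value with ordered_int_to_float().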
Any use, reproduction, disclosure or +* distribution of this software and related documentation without an express +* license agreement from NVIDIA CORPORATION is strictly prohibited. +*/ + +/** @file common_host.h + * @author Thomas Müller, NVIDIA + * @brief Shared functionality among multiple neural-graphics-primitives components. + */ + +#pragma once + +#include + +#include + +#include + +#include + +#include +#include + +namespace ngp { + +namespace fs = filesystem; + +bool is_wsl(); + +fs::path discover_executable_dir(); +fs::path discover_root_dir(); + +#ifdef _WIN32 +std::string utf16_to_utf8(const std::wstring& utf16); +std::wstring utf8_to_utf16(const std::string& utf16); +std::wstring native_string(const fs::path& path); +#else +std::string native_string(const fs::path& path); +#endif + +bool ends_with(const std::string& str, const std::string& ending); +bool ends_with_case_insensitive(const std::string& str, const std::string& ending); + +ETestbedMode mode_from_scene(const std::string& scene); +ETestbedMode mode_from_string(const std::string& str); +std::string to_string(ETestbedMode); + +inline std::string replace_all(std::string str, const std::string& a, const std::string& b) { + std::string::size_type n = 0; + while ((n = str.find(a, n)) != std::string::npos) { + str.replace(n, a.length(), b); + n += b.length(); + } + return str; +} + +enum class EEmaType { + Time, + Step, +}; + +class Ema { +public: + Ema(EEmaType type, float half_life) + : m_type{type}, m_decay{std::pow(0.5f, 1.0f / half_life)}, m_creation_time{std::chrono::steady_clock::now()} {} + + int64_t current_progress() { + if (m_type == EEmaType::Time) { + auto now = std::chrono::steady_clock::now(); + return std::chrono::duration_cast(now - m_creation_time).count(); + } else { + return m_last_progress + 1; + } + } + + void update(float val) { + int64_t cur = current_progress(); + int64_t elapsed = cur - m_last_progress; + m_last_progress = cur; + + float decay = std::pow(m_decay, elapsed); + m_val = val; + m_ema_val = decay * m_ema_val + (1.0f - decay) * val; + } + + void set(float val) { + m_last_progress = current_progress(); + m_val = m_ema_val = val; + } + + float val() const { + return m_val; + } + + float ema_val() const { + return m_ema_val; + } + +private: + float m_val = 0.0f; + float m_ema_val = 0.0f; + EEmaType m_type; + float m_decay; + + int64_t m_last_progress = 0; + std::chrono::time_point m_creation_time; +}; + +uint8_t* load_stbi(const fs::path& path, int* width, int* height, int* comp, int req_comp); +float* load_stbi_float(const fs::path& path, int* width, int* height, int* comp, int req_comp); +uint16_t* load_stbi_16(const fs::path& path, int* width, int* height, int* comp, int req_comp); +bool is_hdr_stbi(const fs::path& path); +int write_stbi(const fs::path& path, int width, int height, int comp, const uint8_t* pixels, int quality = 100); + +FILE* native_fopen(const fs::path& path, const char* mode); + +GPUMemory load_exr_gpu(const fs::path& path, int* width, int* height); +GPUMemory load_stbi_gpu(const fs::path& path, int* width, int* height); + +template +class Buffer2D { +public: + Buffer2D() = default; + Buffer2D(const ivec2& resolution) { + resize(resolution); + } + + T* data() const { + return m_data.data(); + } + + size_t bytes() const { + return m_data.bytes(); + } + + void resize(const ivec2& resolution) { + m_data.resize(product(resolution)); + m_resolution = resolution; + } + + const ivec2& resolution() const { + return m_resolution; + } + + Buffer2DView view() const { + // Row 
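// A minimal usage sketch of the Ema helper above: with EEmaType::Step, each
// update advances progress by one and past values lose half their weight
// every `half_life` updates; EEmaType::Time decays per millisecond of wall
// clock instead. train_one_step() is a hypothetical stand-in.
Ema loss_ema{EEmaType::Step, 100.0f}; // half-life of 100 updates
for (int step = 0; step < 1000; ++step) {
	loss_ema.update(train_one_step());
}
float smoothed = loss_ema.ema_val(); // low-pass filtered loss; val() is the raw last value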
major for now. + return {data(), m_resolution}; + } + + Buffer2DView const_view() const { + // Row major for now. + return {data(), m_resolution}; + } + +private: + GPUMemory m_data; + ivec2 m_resolution; +}; + +struct BoundingBox; +struct Triangle; +std::ostream& operator<<(std::ostream& os, const BoundingBox& triangle); +std::ostream& operator<<(std::ostream& os, const Triangle& triangle); + +} diff --git a/include/neural-graphics-primitives/discrete_distribution.h b/include/neural-graphics-primitives/discrete_distribution.h index 32fda8b40..0e740ae53 100644 --- a/include/neural-graphics-primitives/discrete_distribution.h +++ b/include/neural-graphics-primitives/discrete_distribution.h @@ -14,7 +14,9 @@ #pragma once -NGP_NAMESPACE_BEGIN +#include + +namespace ngp { struct DiscreteDistribution { void build(std::vector weights) { @@ -43,4 +45,4 @@ struct DiscreteDistribution { std::vector cdf; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/dlss.h b/include/neural-graphics-primitives/dlss.h index 7d10fc23b..39c971b7f 100644 --- a/include/neural-graphics-primitives/dlss.h +++ b/include/neural-graphics-primitives/dlss.h @@ -18,7 +18,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { class IDlss { public: @@ -64,4 +64,4 @@ class IDlssProvider { std::shared_ptr init_vulkan_and_ngx(); #endif -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/envmap.cuh b/include/neural-graphics-primitives/envmap.cuh index daa5ad9ef..c2ae325f8 100644 --- a/include/neural-graphics-primitives/envmap.cuh +++ b/include/neural-graphics-primitives/envmap.cuh @@ -15,16 +15,11 @@ #pragma once -#include #include #include -#include -#include -#include - -NGP_NAMESPACE_BEGIN +namespace ngp { inline __device__ vec4 read_envmap(const Buffer2DView& envmap, const vec3& dir) { auto dir_cyl = dir_to_spherical_unorm({dir.z, -dir.x, dir.y}); @@ -40,7 +35,7 @@ inline __device__ vec4 read_envmap(const Buffer2DView& envmap, const } else if (pos.x >= envmap.resolution.x) { pos.x -= envmap.resolution.x; } - pos.y = std::max(std::min(pos.y, envmap.resolution.y-1), 0); + pos.y = max(min(pos.y, envmap.resolution.y-1), 0); return envmap.at(pos); }; @@ -55,7 +50,7 @@ inline __device__ vec4 read_envmap(const Buffer2DView& envmap, const } template -__device__ void deposit_envmap_gradient(const tcnn::vector_t& value, GRAD_T* __restrict__ envmap_gradient, const ivec2 envmap_resolution, const vec3& dir) { +__device__ void deposit_envmap_gradient(const tvec& value, GRAD_T* __restrict__ envmap_gradient, const ivec2 envmap_resolution, const vec3& dir) { auto dir_cyl = dir_to_spherical_unorm({dir.z, -dir.x, dir.y}); auto envmap_float = vec2{dir_cyl.y * (envmap_resolution.x-1), dir_cyl.x * (envmap_resolution.y-1)}; @@ -63,7 +58,7 @@ __device__ void deposit_envmap_gradient(const tcnn::vector_t& value, GRAD_ auto weight = envmap_float - vec2(envmap_texel); - auto deposit_val = [&](const tcnn::vector_t& value, T weight, ivec2 pos) { + auto deposit_val = [&](const tvec& value, T weight, ivec2 pos) { if (pos.x < 0) { pos.x += envmap_resolution.x; } else if (pos.x >= envmap_resolution.x) { @@ -71,8 +66,6 @@ __device__ void deposit_envmap_gradient(const tcnn::vector_t& value, GRAD_ } pos.y = std::max(std::min(pos.y, envmap_resolution.y-1), 0); - vec4 result; - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 // atomicAdd(__half2) is only supported with compute capability 60 and above if (std::is_same::value) { for (uint32_t c = 0; c < 4; c += 2) { @@ -93,4 +86,4 @@ __device__ void deposit_envmap_gradient(const 
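// A minimal sketch of drawing samples from the normalized CDF that
// DiscreteDistribution::build constructs above, reusing binary_search from
// common.h; assumes the cdf member is accessible as declared. A uniform
// u in [0,1) then selects index i with probability weights[i] / sum(weights).
DiscreteDistribution dist;
dist.build({1.0f, 3.0f, 6.0f}); // cdf becomes {0.1, 0.4, 1.0}
uint32_t i = binary_search(0.35f, dist.cdf.data(), (uint32_t)dist.cdf.size()); // -> 1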
tcnn::vector_t& value, GRAD_ deposit_val(value, (weight.x) * (weight.y), {envmap_texel.x+1, envmap_texel.y+1}); } -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/json_binding.h b/include/neural-graphics-primitives/json_binding.h index f55762327..01f55c24e 100644 --- a/include/neural-graphics-primitives/json_binding.h +++ b/include/neural-graphics-primitives/json_binding.h @@ -10,7 +10,7 @@ /** @file json_binding.h * @author Thomas Müller, NVIDIA - * @brief Conversion between eigen + * @brief Conversion between some ngp types and nlohmann::json. */ #pragma once @@ -18,136 +18,11 @@ #include #include -#include - -// Conversion between glm and json -namespace glm { - template - void to_json(nlohmann::json& j, const tmat3x3& mat) { - for (int row = 0; row < 3; ++row) { - nlohmann::json column = nlohmann::json::array(); - for (int col = 0; col < 3; ++col) { - column.push_back(mat[col][row]); - } - j.push_back(column); - } - } - - template - void from_json(const nlohmann::json& j, tmat3x3& mat) { - for (std::size_t row = 0; row < 3; ++row) { - const auto& jrow = j.at(row); - for (std::size_t col = 0; col < 3; ++col) { - const auto& value = jrow.at(col); - mat[col][row] = value.get(); - } - } - } - - template - void to_json(nlohmann::json& j, const tmat4x3& mat) { - for (int row = 0; row < 3; ++row) { - nlohmann::json column = nlohmann::json::array(); - for (int col = 0; col < 4; ++col) { - column.push_back(mat[col][row]); - } - j.push_back(column); - } - } - - template - void from_json(const nlohmann::json& j, tmat4x3& mat) { - for (std::size_t row = 0; row < 3; ++row) { - const auto& jrow = j.at(row); - for (std::size_t col = 0; col < 4; ++col) { - const auto& value = jrow.at(col); - mat[col][row] = value.get(); - } - } - } - - template - void to_json(nlohmann::json& j, const tmat4x4& mat) { - for (int row = 0; row < 4; ++row) { - nlohmann::json column = nlohmann::json::array(); - for (int col = 0; col < 4; ++col) { - column.push_back(mat[col][row]); - } - j.push_back(column); - } - } - - template - void from_json(const nlohmann::json& j, tmat4x4& mat) { - for (std::size_t row = 0; row < 4; ++row) { - const auto& jrow = j.at(row); - for (std::size_t col = 0; col < 4; ++col) { - const auto& value = jrow.at(col); - mat[col][row] = value.get(); - } - } - } +#include - template - void to_json(nlohmann::json& j, const tvec2& v) { - j.push_back(v.x); - j.push_back(v.y); - } - - template - void from_json(const nlohmann::json& j, tvec2& v) { - v.x = j.at(0).get(); - v.y = j.at(1).get(); - } - - template - void to_json(nlohmann::json& j, const tvec3& v) { - j.push_back(v.x); - j.push_back(v.y); - j.push_back(v.z); - } - - template - void from_json(const nlohmann::json& j, tvec3& v) { - v.x = j.at(0).get(); - v.y = j.at(1).get(); - v.z = j.at(2).get(); - } - - template - void to_json(nlohmann::json& j, const tvec4& v) { - j.push_back(v.x); - j.push_back(v.y); - j.push_back(v.z); - j.push_back(v.w); - } - - template - void from_json(const nlohmann::json& j, tvec4& v) { - v.x = j.at(0).get(); - v.y = j.at(1).get(); - v.z = j.at(2).get(); - v.w = j.at(3).get(); - } - - template - void to_json(nlohmann::json& j, const tquat& q) { - j.push_back(q.x); - j.push_back(q.y); - j.push_back(q.z); - j.push_back(q.w); - } - - template - void from_json(const nlohmann::json& j, tquat& q) { - q.x = j.at(0).get(); - q.y = j.at(1).get(); - q.z = j.at(2).get(); - q.w = j.at(3).get(); - } -} +#include -NGP_NAMESPACE_BEGIN +namespace ngp { inline void to_json(nlohmann::json& j, const BoundingBox& box) { 
j["min"] = box.min; @@ -287,7 +162,7 @@ inline void from_json(const nlohmann::json& j, NerfDataset& dataset) { } dataset.render_aabb = j.at("render_aabb"); - dataset.render_aabb_to_local = mat3(1.0f); + dataset.render_aabb_to_local = mat3::identity(); if (j.contains("render_aabb_to_local")) dataset.render_aabb_to_local = j.at("render_aabb_to_local"); dataset.up = j.at("up"); @@ -307,4 +182,4 @@ inline void from_json(const nlohmann::json& j, NerfDataset& dataset) { dataset.n_extra_learnable_dims = j.value("n_extra_learnable_dims", 0); } -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/marching_cubes.h b/include/neural-graphics-primitives/marching_cubes.h index 5d7ba5092..869b4f2a1 100644 --- a/include/neural-graphics-primitives/marching_cubes.h +++ b/include/neural-graphics-primitives/marching_cubes.h @@ -15,37 +15,38 @@ #pragma once #include +#include #include -NGP_NAMESPACE_BEGIN +namespace ngp { ivec3 get_marching_cubes_res(uint32_t res_1d, const BoundingBox& render_aabb); -void marching_cubes_gpu(cudaStream_t stream, BoundingBox render_aabb, mat3 render_aabb_to_local, ivec3 res_3d, float thresh, const tcnn::GPUMemory& density, tcnn::GPUMemory& vert_out, tcnn::GPUMemory& indices_out); +void marching_cubes_gpu(cudaStream_t stream, BoundingBox render_aabb, mat3 render_aabb_to_local, ivec3 res_3d, float thresh, const GPUMemory& density, GPUMemory& vert_out, GPUMemory& indices_out); // computes the average of the 1ring of all verts, as homogenous coordinates -void compute_mesh_1ring(const tcnn::GPUMemory& verts, const tcnn::GPUMemory& indices, tcnn::GPUMemory& output_pos, tcnn::GPUMemory& output_normals); +void compute_mesh_1ring(const GPUMemory& verts, const GPUMemory& indices, GPUMemory& output_pos, GPUMemory& output_normals); void compute_mesh_opt_gradients( float thresh, - const tcnn::GPUMemory& verts, - const tcnn::GPUMemory& vert_normals, - const tcnn::GPUMemory& verts_smoothed, - const tcnn::network_precision_t* densities, + const GPUMemory& verts, + const GPUMemory& vert_normals, + const GPUMemory& verts_smoothed, + const network_precision_t* densities, uint32_t input_gradient_width, const float* input_gradients, - tcnn::GPUMemory& verts_gradient_out, + GPUMemory& verts_gradient_out, float k_smooth_amount, float k_density_amount, float k_inflate_amount ); void save_mesh( - tcnn::GPUMemory& verts, - tcnn::GPUMemory& normals, - tcnn::GPUMemory& colors, - tcnn::GPUMemory& indices, + GPUMemory& verts, + GPUMemory& normals, + GPUMemory& colors, + GPUMemory& indices, const fs::path& path, bool unwrap_it, float nerf_scale, @@ -54,10 +55,10 @@ void save_mesh( #ifdef NGP_GUI void draw_mesh_gl( - const tcnn::GPUMemory& verts, - const tcnn::GPUMemory& normals, - const tcnn::GPUMemory& cols, - const tcnn::GPUMemory& indices, + const GPUMemory& verts, + const GPUMemory& normals, + const GPUMemory& cols, + const GPUMemory& indices, const ivec2& resolution, const vec2& focal_length, const mat4x3& camera_matrix, @@ -70,8 +71,8 @@ uint32_t compile_shader(bool pixel, const char* code); bool check_shader(uint32_t handle, const char* desc, bool program); #endif -void save_density_grid_to_png(const tcnn::GPUMemory& density, const fs::path& path, ivec3 res3d, float thresh, bool swap_y_z = true, float density_range = 4.f); -void save_rgba_grid_to_png_sequence(const tcnn::GPUMemory& rgba, const fs::path& path, ivec3 res3d, bool swap_y_z = true); -void save_rgba_grid_to_raw_file(const tcnn::GPUMemory& rgba, const fs::path& path, ivec3 res3d, bool swap_y_z, int cascade); +void 
save_density_grid_to_png(const GPUMemory& density, const fs::path& path, ivec3 res3d, float thresh, bool swap_y_z = true, float density_range = 4.f); +void save_rgba_grid_to_png_sequence(const GPUMemory& rgba, const fs::path& path, ivec3 res3d, bool swap_y_z = true); +void save_rgba_grid_to_raw_file(const GPUMemory& rgba, const fs::path& path, ivec3 res3d, bool swap_y_z, int cascade); -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/nerf.h b/include/neural-graphics-primitives/nerf.h index 26f826a3a..c8d5d3780 100644 --- a/include/neural-graphics-primitives/nerf.h +++ b/include/neural-graphics-primitives/nerf.h @@ -15,32 +15,12 @@ #pragma once #include +#include -#include - -NGP_NAMESPACE_BEGIN - -// size of the density/occupancy grid in number of cells along an axis. -inline constexpr __device__ uint32_t NERF_GRIDSIZE() { - return 128; -} - -inline constexpr __device__ uint32_t NERF_GRID_N_CELLS() { - return NERF_GRIDSIZE() * NERF_GRIDSIZE() * NERF_GRIDSIZE(); -} - -struct NerfPayload { - vec3 origin; - vec3 dir; - float t; - float max_weight; - uint32_t idx; - uint16_t n_steps; - bool alive; -}; +namespace ngp { struct RaysNerfSoa { -#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) +#if defined(__CUDACC__) || (defined(__clang__) && defined(__CUDA__)) void copy_from_other_async(const RaysNerfSoa& other, cudaStream_t stream) { CUDA_CHECK_THROW(cudaMemcpyAsync(rgba, other.rgba, size * sizeof(vec4), cudaMemcpyDeviceToDevice, stream)); CUDA_CHECK_THROW(cudaMemcpyAsync(depth, other.depth, size * sizeof(float), cudaMemcpyDeviceToDevice, stream)); @@ -61,53 +41,4 @@ struct RaysNerfSoa { size_t size; }; -//#define TRIPLANAR_COMPATIBLE_POSITIONS // if this is defined, then positions are stored as [x,y,z,x] so that it can be split as [x,y] [y,z] [z,x] by the input encoding - -struct NerfPosition { - NGP_HOST_DEVICE NerfPosition(const vec3& pos, float dt) - : - p{pos} -#ifdef TRIPLANAR_COMPATIBLE_POSITIONS - , x{pos.x} -#endif - {} - vec3 p; -#ifdef TRIPLANAR_COMPATIBLE_POSITIONS - float x; -#endif -}; - -struct NerfDirection { - NGP_HOST_DEVICE NerfDirection(const vec3& dir, float dt) : d{dir} {} - vec3 d; -}; - -struct NerfCoordinate { - NGP_HOST_DEVICE NerfCoordinate(const vec3& pos, const vec3& dir, float dt) : pos{pos, dt}, dt{dt}, dir{dir, dt} {} - NGP_HOST_DEVICE void set_with_optional_extra_dims(const vec3& pos, const vec3& dir, float dt, const float* extra_dims, uint32_t stride_in_bytes) { - this->dt = dt; - this->pos = NerfPosition(pos, dt); - this->dir = NerfDirection(dir, dt); - copy_extra_dims(extra_dims, stride_in_bytes); - } - inline NGP_HOST_DEVICE const float* get_extra_dims() const { return (const float*)(this + 1); } - inline NGP_HOST_DEVICE float* get_extra_dims() { return (float*)(this + 1); } - - NGP_HOST_DEVICE void copy(const NerfCoordinate& inp, uint32_t stride_in_bytes) { - *this = inp; - copy_extra_dims(inp.get_extra_dims(), stride_in_bytes); - } - NGP_HOST_DEVICE inline void copy_extra_dims(const float *extra_dims, uint32_t stride_in_bytes) { - if (stride_in_bytes >= sizeof(NerfCoordinate)) { - float* dst = get_extra_dims(); - const uint32_t n_extra = (stride_in_bytes - sizeof(NerfCoordinate)) / sizeof(float); - for (uint32_t i = 0; i < n_extra; ++i) dst[i] = extra_dims[i]; - } - } - - NerfPosition pos; - float dt; - NerfDirection dir; -}; - -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/nerf_device.cuh b/include/neural-graphics-primitives/nerf_device.cuh new file mode 100644 index 000000000..61cce7479 --- 
/dev/null +++ b/include/neural-graphics-primitives/nerf_device.cuh @@ -0,0 +1,617 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +/** @file nerf_device.cuh + * @author Thomas Müller & Alex Evans, NVIDIA + */ + +#pragma once + +#include + +#include + +namespace ngp { + +// size of the density/occupancy grid in number of cells along an axis. +inline constexpr __device__ uint32_t NERF_GRIDSIZE() { return 128; } +inline constexpr __device__ uint32_t NERF_GRID_N_CELLS() { return NERF_GRIDSIZE() * NERF_GRIDSIZE() * NERF_GRIDSIZE(); } + +inline constexpr __device__ float NERF_RENDERING_NEAR_DISTANCE() { return 0.05f; } +inline constexpr __device__ uint32_t NERF_STEPS() { return 1024; } // finest number of steps per unit length +inline constexpr __device__ uint32_t NERF_CASCADES() { return 8; } + +inline constexpr __device__ float SQRT3() { return 1.73205080757f; } +inline constexpr __device__ float STEPSIZE() { return (SQRT3() / NERF_STEPS()); } // for nerf raymarch +inline constexpr __device__ float MIN_CONE_STEPSIZE() { return STEPSIZE(); } +// Maximum step size is the width of the coarsest gridsize cell. +inline constexpr __device__ float MAX_CONE_STEPSIZE() { return STEPSIZE() * (1<<(NERF_CASCADES()-1)) * NERF_STEPS() / NERF_GRIDSIZE(); } + +// Used to index into the PRNG stream. Must be larger than the number of +// samples consumed by any given training ray. +inline constexpr __device__ uint32_t N_MAX_RANDOM_SAMPLES_PER_RAY() { return 16; } + +// Any alpha below this is considered "invisible" and is thus culled away. +inline constexpr __device__ float NERF_MIN_OPTICAL_THICKNESS() { return 0.01f; } + +struct TrainingImageMetadata { + // Camera intrinsics and additional data associated with a NeRF training image + // the memory to back the pixels and rays is held by GPUMemory objects in the NerfDataset and copied here. + const void* pixels = nullptr; + EImageDataType image_data_type = EImageDataType::Half; + + const float* depth = nullptr; + const Ray* rays = nullptr; + + Lens lens = {}; + ivec2 resolution = ivec2(0); + vec2 principal_point = vec2(0.5f); + vec2 focal_length = vec2(1000.f); + vec4 rolling_shutter = vec4(0.0f); + vec3 light_dir = vec3(0.f); // TODO: replace this with more generic float[] of task-specific metadata. 
+}; + +struct LossAndGradient { + vec3 loss; + vec3 gradient; + + NGP_HOST_DEVICE LossAndGradient operator*(float scalar) { + return {loss * scalar, gradient * scalar}; + } + + NGP_HOST_DEVICE LossAndGradient operator/(float scalar) { + return {loss / scalar, gradient / scalar}; + } +}; + +inline NGP_HOST_DEVICE LossAndGradient l2_loss(const vec3& target, const vec3& prediction) { + vec3 difference = prediction - target; + return { + difference * difference, + 2.0f * difference + }; +} + +inline NGP_HOST_DEVICE LossAndGradient relative_l2_loss(const vec3& target, const vec3& prediction) { + vec3 difference = prediction - target; + vec3 denom = prediction * prediction + 1e-2f; + return { + difference * difference / denom, + 2.0f * difference / denom + }; +} + +inline NGP_HOST_DEVICE LossAndGradient l1_loss(const vec3& target, const vec3& prediction) { + vec3 difference = prediction - target; + return { + abs(difference), + copysign(vec3(1.0f), difference), + }; +} + +inline NGP_HOST_DEVICE LossAndGradient huber_loss(const vec3& target, const vec3& prediction, float alpha = 1) { + vec3 difference = prediction - target; + vec3 abs_diff = abs(difference); + vec3 square = 0.5f/alpha * difference * difference; + return { + { + abs_diff.x > alpha ? (abs_diff.x - 0.5f * alpha) : square.x, + abs_diff.y > alpha ? (abs_diff.y - 0.5f * alpha) : square.y, + abs_diff.z > alpha ? (abs_diff.z - 0.5f * alpha) : square.z, + }, + { + abs_diff.x > alpha ? (difference.x > 0 ? 1.0f : -1.0f) : (difference.x / alpha), + abs_diff.y > alpha ? (difference.y > 0 ? 1.0f : -1.0f) : (difference.y / alpha), + abs_diff.z > alpha ? (difference.z > 0 ? 1.0f : -1.0f) : (difference.z / alpha), + }, + }; +} + +inline NGP_HOST_DEVICE LossAndGradient log_l1_loss(const vec3& target, const vec3& prediction) { + vec3 difference = prediction - target; + vec3 divisor = abs(difference) + 1.0f; + return { + log(divisor), + copysign(vec3(1.0f) / divisor, difference), + }; +} + +inline NGP_HOST_DEVICE LossAndGradient smape_loss(const vec3& target, const vec3& prediction) { + vec3 difference = prediction - target; + vec3 denom = 0.5f * (abs(prediction) + abs(target)) + 1e-2f; + return { + abs(difference) / denom, + copysign(vec3(1.0f) / denom, difference), + }; +} + +inline NGP_HOST_DEVICE LossAndGradient mape_loss(const vec3& target, const vec3& prediction) { + vec3 difference = prediction - target; + vec3 denom = abs(prediction) + 1e-2f; + return { + abs(difference) / denom, + copysign(vec3(1.0f) / denom, difference), + }; +} + +struct NerfPayload { + vec3 origin; + vec3 dir; + float t; + float max_weight; + uint32_t idx; + uint16_t n_steps; + bool alive; +}; + +//#define TRIPLANAR_COMPATIBLE_POSITIONS // if this is defined, then positions are stored as [x,y,z,x] so that it can be split as [x,y] [y,z] [z,x] by the input encoding + +struct NerfPosition { + NGP_HOST_DEVICE NerfPosition(const vec3& pos, float dt) + : + p{pos} +#ifdef TRIPLANAR_COMPATIBLE_POSITIONS + , x{pos.x} +#endif + {} + vec3 p; +#ifdef TRIPLANAR_COMPATIBLE_POSITIONS + float x; +#endif +}; + +struct NerfDirection { + NGP_HOST_DEVICE NerfDirection(const vec3& dir, float dt) : d{dir} {} + vec3 d; +}; + +struct NerfCoordinate { + NGP_HOST_DEVICE NerfCoordinate(const vec3& pos, const vec3& dir, float dt) : pos{pos, dt}, dt{dt}, dir{dir, dt} {} + NGP_HOST_DEVICE void set_with_optional_extra_dims(const vec3& pos, const vec3& dir, float dt, const float* extra_dims, uint32_t stride_in_bytes) { + this->dt = dt; + this->pos = NerfPosition(pos, dt); + this->dir = 
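// A minimal scalar sketch of huber_loss above: quadratic within alpha of the
// target and linear outside, with value and slope agreeing at the transition,
// so the loss is C^1 there (both branches give 0.5 * alpha, with slope 1, at
// |d| == alpha).
#include <cmath>

inline float huber_1d(float d, float alpha = 1.0f) {
	float a = std::fabs(d);
	return a > alpha ? (a - 0.5f * alpha)      // linear tail: bounded gradient
	                 : (0.5f / alpha * d * d); // quadratic bowl near the target
}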
NerfDirection(dir, dt); + copy_extra_dims(extra_dims, stride_in_bytes); + } + inline NGP_HOST_DEVICE const float* get_extra_dims() const { return (const float*)(this + 1); } + inline NGP_HOST_DEVICE float* get_extra_dims() { return (float*)(this + 1); } + + NGP_HOST_DEVICE void copy(const NerfCoordinate& inp, uint32_t stride_in_bytes) { + *this = inp; + copy_extra_dims(inp.get_extra_dims(), stride_in_bytes); + } + NGP_HOST_DEVICE inline void copy_extra_dims(const float *extra_dims, uint32_t stride_in_bytes) { + if (stride_in_bytes >= sizeof(NerfCoordinate)) { + float* dst = get_extra_dims(); + const uint32_t n_extra = (stride_in_bytes - sizeof(NerfCoordinate)) / sizeof(float); + for (uint32_t i = 0; i < n_extra; ++i) dst[i] = extra_dims[i]; + } + } + + NerfPosition pos; + float dt; + NerfDirection dir; +}; + +inline NGP_HOST_DEVICE float network_to_rgb(float val, ENerfActivation activation) { + switch (activation) { + case ENerfActivation::None: return val; + case ENerfActivation::ReLU: return val > 0.0f ? val : 0.0f; + case ENerfActivation::Logistic: return logistic(val); + case ENerfActivation::Exponential: return expf(clamp(val, -10.0f, 10.0f)); + default: assert(false); + } + return 0.0f; +} + +inline NGP_HOST_DEVICE float network_to_rgb_derivative(float val, ENerfActivation activation) { + switch (activation) { + case ENerfActivation::None: return 1.0f; + case ENerfActivation::ReLU: return val > 0.0f ? 1.0f : 0.0f; + case ENerfActivation::Logistic: { float density = logistic(val); return density * (1 - density); }; + case ENerfActivation::Exponential: return expf(clamp(val, -10.0f, 10.0f)); + default: assert(false); + } + return 0.0f; +} + +template +NGP_HOST_DEVICE vec3 network_to_rgb_derivative_vec(const T& val, ENerfActivation activation) { + return { + network_to_rgb_derivative(float(val[0]), activation), + network_to_rgb_derivative(float(val[1]), activation), + network_to_rgb_derivative(float(val[2]), activation), + }; +} + +inline NGP_HOST_DEVICE float network_to_density(float val, ENerfActivation activation) { + switch (activation) { + case ENerfActivation::None: return val; + case ENerfActivation::ReLU: return val > 0.0f ? val : 0.0f; + case ENerfActivation::Logistic: return logistic(val); + case ENerfActivation::Exponential: return expf(val); + default: assert(false); + } + return 0.0f; +} + +inline NGP_HOST_DEVICE float network_to_density_derivative(float val, ENerfActivation activation) { + switch (activation) { + case ENerfActivation::None: return 1.0f; + case ENerfActivation::ReLU: return val > 0.0f ? 
1.0f : 0.0f; + case ENerfActivation::Logistic: { float density = logistic(val); return density * (1 - density); }; + case ENerfActivation::Exponential: return expf(clamp(val, -15.0f, 15.0f)); + default: assert(false); + } + return 0.0f; +} + +template +NGP_HOST_DEVICE vec3 network_to_rgb_vec(const T& val, ENerfActivation activation) { + return { + network_to_rgb(float(val[0]), activation), + network_to_rgb(float(val[1]), activation), + network_to_rgb(float(val[2]), activation), + }; +} + +inline NGP_HOST_DEVICE vec3 warp_position(const vec3& pos, const BoundingBox& aabb) { + // return {logistic(pos.x - 0.5f), logistic(pos.y - 0.5f), logistic(pos.z - 0.5f)}; + // return pos; + + return aabb.relative_pos(pos); +} + +inline NGP_HOST_DEVICE vec3 unwarp_position(const vec3& pos, const BoundingBox& aabb) { + // return {logit(pos.x) + 0.5f, logit(pos.y) + 0.5f, logit(pos.z) + 0.5f}; + // return pos; + + return aabb.min + pos * aabb.diag(); +} + +inline NGP_HOST_DEVICE vec3 unwarp_position_derivative(const vec3& pos, const BoundingBox& aabb) { + // return {logit(pos.x) + 0.5f, logit(pos.y) + 0.5f, logit(pos.z) + 0.5f}; + // return pos; + + return aabb.diag(); +} + +inline NGP_HOST_DEVICE vec3 warp_position_derivative(const vec3& pos, const BoundingBox& aabb) { + return vec3(1.0f) / unwarp_position_derivative(pos, aabb); +} + +inline NGP_HOST_DEVICE vec3 warp_direction(const vec3& dir) { + return (dir + 1.0f) * 0.5f; +} + +inline NGP_HOST_DEVICE vec3 unwarp_direction(const vec3& dir) { + return dir * 2.0f - 1.0f; +} + +inline NGP_HOST_DEVICE vec3 warp_direction_derivative(const vec3& dir) { + return vec3(0.5f); +} + +inline NGP_HOST_DEVICE vec3 unwarp_direction_derivative(const vec3& dir) { + return vec3(2.0f); +} + +inline NGP_HOST_DEVICE float warp_dt(float dt) { + float max_stepsize = MIN_CONE_STEPSIZE() * (1<<(NERF_CASCADES()-1)); + return (dt - MIN_CONE_STEPSIZE()) / (max_stepsize - MIN_CONE_STEPSIZE()); +} + +inline NGP_HOST_DEVICE float unwarp_dt(float dt) { + float max_stepsize = MIN_CONE_STEPSIZE() * (1<<(NERF_CASCADES()-1)); + return dt * (max_stepsize - MIN_CONE_STEPSIZE()) + MIN_CONE_STEPSIZE(); +} + +inline NGP_HOST_DEVICE uint32_t cascaded_grid_idx_at(vec3 pos, uint32_t mip) { + float mip_scale = scalbnf(1.0f, -mip); + pos -= vec3(0.5f); + pos *= mip_scale; + pos += vec3(0.5f); + + ivec3 i = pos * (float)NERF_GRIDSIZE(); + if (i.x < 0 || i.x >= NERF_GRIDSIZE() || i.y < 0 || i.y >= NERF_GRIDSIZE() || i.z < 0 || i.z >= NERF_GRIDSIZE()) { + return 0xFFFFFFFF; + } + + return morton3D(i.x, i.y, i.z); +} + +inline NGP_HOST_DEVICE uint32_t grid_mip_offset(uint32_t mip) { + return NERF_GRID_N_CELLS() * mip; +} + +inline NGP_HOST_DEVICE bool density_grid_occupied_at(const vec3& pos, const uint8_t* density_grid_bitfield, uint32_t mip) { + uint32_t idx = cascaded_grid_idx_at(pos, mip); + if (idx == 0xFFFFFFFF) { + return false; + } + return density_grid_bitfield[idx/8+grid_mip_offset(mip)/8] & (1<<(idx%8)); +} + +inline NGP_HOST_DEVICE float cascaded_grid_at(vec3 pos, const float* cascaded_grid, uint32_t mip) { + uint32_t idx = cascaded_grid_idx_at(pos, mip); + if (idx == 0xFFFFFFFF) { + return 0.0f; + } + return cascaded_grid[idx+grid_mip_offset(mip)]; +} + +inline NGP_HOST_DEVICE float& cascaded_grid_at(vec3 pos, float* cascaded_grid, uint32_t mip) { + uint32_t idx = cascaded_grid_idx_at(pos, mip); + if (idx == 0xFFFFFFFF) { + idx = 0; + printf("WARNING: invalid cascaded grid access."); + } + return cascaded_grid[idx+grid_mip_offset(mip)]; +} + +inline NGP_HOST_DEVICE float 
distance_to_next_voxel(const vec3& pos, const vec3& dir, const vec3& idir, float res) { // DDA-like step
+	vec3 p = res * (pos - 0.5f);
+	float tx = (floorf(p.x + 0.5f + 0.5f * sign(dir.x)) - p.x) * idir.x;
+	float ty = (floorf(p.y + 0.5f + 0.5f * sign(dir.y)) - p.y) * idir.y;
+	float tz = (floorf(p.z + 0.5f + 0.5f * sign(dir.z)) - p.z) * idir.z;
+	float t = min(min(tx, ty), tz);
+
+	return fmaxf(t / res, 0.0f);
+}
+
+inline NGP_HOST_DEVICE float calc_cone_angle(float cosine, const vec2& focal_length, float cone_angle_constant) {
+	// Pixel size. Doesn't always yield a good performance vs. quality
+	// trade-off. Especially if training pixels have a much different
+	// size than rendering pixels.
+	// return cosine*cosine / focal_length.mean();
+
+	return cone_angle_constant;
+}
+
+inline NGP_HOST_DEVICE float to_stepping_space(float t, float cone_angle) {
+	if (cone_angle <= 1e-5f) {
+		return t / MIN_CONE_STEPSIZE();
+	}
+
+	float log1p_c = logf(1.0f + cone_angle);
+
+	float a = (logf(MIN_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c;
+	float b = (logf(MAX_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c;
+
+	float at = expf(a * log1p_c);
+	float bt = expf(b * log1p_c);
+
+	if (t <= at) {
+		return (t - at) / MIN_CONE_STEPSIZE() + a;
+	} else if (t <= bt) {
+		return logf(t) / log1p_c;
+	} else {
+		return (t - bt) / MAX_CONE_STEPSIZE() + b;
+	}
+}
+
+inline NGP_HOST_DEVICE float from_stepping_space(float n, float cone_angle) {
+	if (cone_angle <= 1e-5f) {
+		return n * MIN_CONE_STEPSIZE();
+	}
+
+	float log1p_c = logf(1.0f + cone_angle);
+
+	float a = (logf(MIN_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c;
+	float b = (logf(MAX_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c;
+
+	float at = expf(a * log1p_c);
+	float bt = expf(b * log1p_c);
+
+	if (n <= a) {
+		return (n - a) * MIN_CONE_STEPSIZE() + at;
+	} else if (n <= b) {
+		return expf(n * log1p_c);
+	} else {
+		return (n - b) * MAX_CONE_STEPSIZE() + bt;
+	}
+}
+
+inline NGP_HOST_DEVICE float advance_n_steps(float t, float cone_angle, float n) {
+	return from_stepping_space(to_stepping_space(t, cone_angle) + n, cone_angle);
+}
+
+inline NGP_HOST_DEVICE float calc_dt(float t, float cone_angle) {
+	return advance_n_steps(t, cone_angle, 1.0f) - t;
+}
+
+inline NGP_HOST_DEVICE float advance_to_next_voxel(float t, float cone_angle, const vec3& pos, const vec3& dir, const vec3& idir, uint32_t mip) {
+	float res = scalbnf(NERF_GRIDSIZE(), -(int)mip);
+
+	float t_target = t + distance_to_next_voxel(pos, dir, idir, res);
+
+	// Analytic stepping in multiples of 1 in the "log-space" of our exponential stepping routine
+	t = to_stepping_space(t, cone_angle);
+	t_target = to_stepping_space(t_target, cone_angle);
+
+	return from_stepping_space(t + ceilf(fmaxf(t_target - t, 0.5f)), cone_angle);
+}
+
+inline NGP_HOST_DEVICE uint32_t mip_from_pos(const vec3& pos, uint32_t max_cascade = NERF_CASCADES()-1) {
+	int exponent;
+	float maxval = max(abs(pos - 0.5f));
+	frexpf(maxval, &exponent);
+	return (uint32_t)clamp(exponent+1, 0, (int)max_cascade);
+}
+
+inline NGP_HOST_DEVICE uint32_t mip_from_dt(float dt, const vec3& pos, uint32_t max_cascade = NERF_CASCADES()-1) {
+	uint32_t mip = mip_from_pos(pos, max_cascade);
+	dt *= 2 * NERF_GRIDSIZE();
+	if (dt < 1.0f) {
+		return mip;
+	}
+
+	int exponent;
+	frexpf(dt, &exponent);
+	return (uint32_t)clamp((int)mip, exponent, (int)max_cascade);
+}
+
+template 
+NGP_HOST_DEVICE float if_unoccupied_advance_to_next_occupied_voxel(
+	float t,
+	float cone_angle,
+	const Ray& ray,
+	const vec3& idir,
+	const uint8_t* __restrict__ 
density_grid, + uint32_t min_mip, + uint32_t max_mip, + BoundingBox aabb, + mat3 aabb_to_local = mat3::identity() +) { + while (true) { + vec3 pos = ray(t); + if (t >= MAX_DEPTH() || !aabb.contains(aabb_to_local * pos)) { + return MAX_DEPTH(); + } + + uint32_t mip = clamp(MIP_FROM_DT ? mip_from_dt(calc_dt(t, cone_angle), pos) : mip_from_pos(pos), min_mip, max_mip); + + if (!density_grid || density_grid_occupied_at(pos, density_grid, mip)) { + return t; + } + + // Find largest empty voxel surrounding us, such that we can advance as far as possible in the next step. + // Other places that do voxel stepping don't need this, because they don't rely on thread coherence as + // much as this one here. + while (mip < max_mip && !density_grid_occupied_at(pos, density_grid, mip+1)) { + ++mip; + } + + t = advance_to_next_voxel(t, cone_angle, pos, ray.d, idir, mip); + } +} + +static constexpr float UNIFORM_SAMPLING_FRACTION = 0.5f; + +inline NGP_HOST_DEVICE vec2 sample_cdf_2d(vec2 sample, uint32_t img, const ivec2& res, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y, float* __restrict__ pdf) { + if (sample.x < UNIFORM_SAMPLING_FRACTION) { + sample.x /= UNIFORM_SAMPLING_FRACTION; + return sample; + } + + sample.x = (sample.x - UNIFORM_SAMPLING_FRACTION) / (1.0f - UNIFORM_SAMPLING_FRACTION); + + cdf_y += img * res.y; + + // First select row according to cdf_y + uint32_t y = binary_search(sample.y, cdf_y, res.y); + float prev = y > 0 ? cdf_y[y-1] : 0.0f; + float pmf_y = cdf_y[y] - prev; + sample.y = (sample.y - prev) / pmf_y; + + cdf_x_cond_y += img * res.y * res.x + y * res.x; + + // Then, select col according to x + uint32_t x = binary_search(sample.x, cdf_x_cond_y, res.x); + prev = x > 0 ? cdf_x_cond_y[x-1] : 0.0f; + float pmf_x = cdf_x_cond_y[x] - prev; + sample.x = (sample.x - prev) / pmf_x; + + if (pdf) { + *pdf = pmf_x * pmf_y * product(res); + } + + return {((float)x + sample.x) / (float)res.x, ((float)y + sample.y) / (float)res.y}; +} + +inline NGP_HOST_DEVICE float pdf_2d(vec2 sample, uint32_t img, const ivec2& res, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y) { + ivec2 p = clamp(ivec2(sample * vec2(res)), 0, res - 1); + + cdf_y += img * res.y; + cdf_x_cond_y += img * res.y * res.x + p.y * res.x; + + float pmf_y = cdf_y[p.y]; + if (p.y > 0) { + pmf_y -= cdf_y[p.y-1]; + } + + float pmf_x = cdf_x_cond_y[p.x]; + if (p.x > 0) { + pmf_x -= cdf_x_cond_y[p.x-1]; + } + + // Probability mass of picking the pixel + float pmf = pmf_x * pmf_y; + + // To convert to probability density, divide by area of pixel + return UNIFORM_SAMPLING_FRACTION + pmf * product(res) * (1.0f - UNIFORM_SAMPLING_FRACTION); +} + +inline __device__ vec2 nerf_random_image_pos_training(default_rng_t& rng, const ivec2& resolution, bool snap_to_pixel_centers, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y, const ivec2& cdf_res, uint32_t img, float* __restrict__ pdf = nullptr) { + vec2 uv = random_val_2d(rng); + + if (cdf_x_cond_y) { + uv = sample_cdf_2d(uv, img, cdf_res, cdf_x_cond_y, cdf_y, pdf); + } else { + // // Warp-coherent tile + // uv.x = __shfl_sync(0xFFFFFFFF, uv.x, 0); + // uv.y = __shfl_sync(0xFFFFFFFF, uv.y, 0); + + // const ivec2 TILE_SIZE = {8, 4}; + // uv = (uv * vec2(resolution - TILE_SIZE) + vec2(tcnn::lane_id() % TILE_SIZE.x, tcnn::lane_id() / threadIdx.x)) / vec2(resolution); + + if (pdf) { + *pdf = 1.0f; + } + } + + if (snap_to_pixel_centers) { + uv = (vec2(clamp(ivec2(uv * vec2(resolution)), 0, resolution - 1)) + 0.5f) / 
vec2(resolution); + } + + return uv; +} + +inline NGP_HOST_DEVICE uint32_t image_idx(uint32_t base_idx, uint32_t n_rays, uint32_t n_rays_total, uint32_t n_training_images, const float* __restrict__ cdf = nullptr, float* __restrict__ pdf = nullptr) { + if (cdf) { + float sample = ld_random_val(base_idx/* + n_rays_total*/, 0xdeadbeef); + // float sample = random_val(base_idx/* + n_rays_total*/); + uint32_t img = binary_search(sample, cdf, n_training_images); + + if (pdf) { + float prev = img > 0 ? cdf[img-1] : 0.0f; + *pdf = (cdf[img] - prev) * n_training_images; + } + + return img; + } + + // return ((base_idx/* + n_rays_total*/) * 56924617 + 96925573) % n_training_images; + + // Neighboring threads in the warp process the same image. Increases locality. + if (pdf) { + *pdf = 1.0f; + } + return (((base_idx/* + n_rays_total*/) * n_training_images) / n_rays) % n_training_images; +} + +inline NGP_HOST_DEVICE LossAndGradient loss_and_gradient(const vec3& target, const vec3& prediction, ELossType loss_type) { + switch (loss_type) { + case ELossType::RelativeL2: return relative_l2_loss(target, prediction); break; + case ELossType::L1: return l1_loss(target, prediction); break; + case ELossType::Mape: return mape_loss(target, prediction); break; + case ELossType::Smape: return smape_loss(target, prediction); break; + // Note: we divide the huber loss by a factor of 5 such that its L2 region near zero + // matches with the L2 loss and error numbers become more comparable. This allows reading + // off dB numbers of ~converged models and treating them as approximate PSNR to compare + // with other NeRF methods. Self-normalizing optimizers such as Adam are agnostic to such + // constant factors; optimization is therefore unaffected. + case ELossType::Huber: return huber_loss(target, prediction, 0.1f) / 5.0f; break; + case ELossType::LogL1: return log_l1_loss(target, prediction); break; + default: case ELossType::L2: return l2_loss(target, prediction); break; + } +} + +} diff --git a/include/neural-graphics-primitives/nerf_loader.h b/include/neural-graphics-primitives/nerf_loader.h index 6456cc3f8..56f2f2f26 100644 --- a/include/neural-graphics-primitives/nerf_loader.h +++ b/include/neural-graphics-primitives/nerf_loader.h @@ -16,34 +16,18 @@ #pragma once #include -#include +#include +#include #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { // how much to scale the scene by vs the original nerf dataset; we want to fit the thing in the unit cube static constexpr float NERF_SCALE = 0.33f; -struct TrainingImageMetadata { - // Camera intrinsics and additional data associated with a NeRF training image - // the memory to back the pixels and rays is held by GPUMemory objects in the NerfDataset and copied here. - const void* pixels = nullptr; - EImageDataType image_data_type = EImageDataType::Half; - - const float* depth = nullptr; - const Ray* rays = nullptr; - - Lens lens = {}; - ivec2 resolution = ivec2(0); - vec2 principal_point = vec2(0.5f); - vec2 focal_length = vec2(1000.f); - vec4 rolling_shutter = vec4(0.0f); - vec3 light_dir = vec3(0.f); // TODO: replace this with more generic float[] of task-specific metadata. 
-}; - inline size_t image_type_size(EImageDataType type) { switch (type) { case EImageDataType::None: return 0; @@ -67,23 +51,23 @@ struct NerfDataset { return xforms == other.xforms && paths == other.paths; } - std::vector> raymemory; - std::vector> pixelmemory; - std::vector> depthmemory; + std::vector> raymemory; + std::vector> pixelmemory; + std::vector> depthmemory; std::vector metadata; - tcnn::GPUMemory metadata_gpu; + GPUMemory metadata_gpu; void update_metadata(int first = 0, int last = -1); std::vector xforms; std::vector paths; - tcnn::GPUMemory sharpness_data; + GPUMemory sharpness_data; ivec2 sharpness_resolution = {0, 0}; - tcnn::GPUMemory envmap_data; + GPUMemory envmap_data; BoundingBox render_aabb = {}; - mat3 render_aabb_to_local = mat3(1.0f); + mat3 render_aabb_to_local = mat3::identity(); vec3 up = {0.0f, 1.0f, 0.0f}; vec3 offset = {0.0f, 0.0f, 0.0f}; size_t n_images = 0; @@ -107,9 +91,9 @@ struct NerfDataset { vec3 nerf_direction_to_ngp(const vec3& nerf_dir) { vec3 result = nerf_dir; if (from_mitsuba) { - result *= -1; + result *= -1.0f; } else { - result = vec3(result.y, result.z, result.x); + result = vec3{result.y, result.z, result.x}; } return result; } @@ -122,8 +106,8 @@ struct NerfDataset { result[3] = result[3] * scale + offset; if (from_mitsuba) { - result[0] *= -1; - result[2] *= -1; + result[0] *= -1.0f; + result[2] *= -1.0f; } else { // Cycle axes xyz<-yzx vec4 tmp = row(result, 0); @@ -138,8 +122,8 @@ struct NerfDataset { mat4x3 ngp_matrix_to_nerf(const mat4x3& ngp_matrix, bool scale_columns = false) const { mat4x3 result = ngp_matrix; if (from_mitsuba) { - result[0] *= -1; - result[2] *= -1; + result[0] *= -1.0f; + result[2] *= -1.0f; } else { // Cycle axes xyz->yzx vec4 tmp = row(result, 0); @@ -156,14 +140,14 @@ struct NerfDataset { vec3 ngp_position_to_nerf(vec3 pos) const { if (!from_mitsuba) { - pos = vec3(pos.z, pos.x, pos.y); + pos = vec3{pos.z, pos.x, pos.y}; } return (pos - offset) / scale; } vec3 nerf_position_to_ngp(const vec3 &pos) const { vec3 rv = pos * scale + offset; - return from_mitsuba ? rv : vec3(rv.y, rv.z, rv.x); + return from_mitsuba ? 
rv : vec3{rv.y, rv.z, rv.x}; } void nerf_ray_to_ngp(Ray& ray, bool scale_direction = false) { @@ -187,4 +171,4 @@ struct NerfDataset { NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amount = 0.f); NerfDataset create_empty_nerf_dataset(size_t n_images, int aabb_scale = 1, bool is_hdr = false); -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/nerf_network.h b/include/neural-graphics-primitives/nerf_network.h index 55b513056..3dbf67321 100644 --- a/include/neural-graphics-primitives/nerf_network.h +++ b/include/neural-graphics-primitives/nerf_network.h @@ -26,7 +26,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { template __global__ void extract_density( @@ -74,39 +74,41 @@ __global__ void add_density_gradient( } template -class NerfNetwork : public tcnn::Network { +class NerfNetwork : public Network { public: using json = nlohmann::json; NerfNetwork(uint32_t n_pos_dims, uint32_t n_dir_dims, uint32_t n_extra_dims, uint32_t dir_offset, const json& pos_encoding, const json& dir_encoding, const json& density_network, const json& rgb_network) : m_n_pos_dims{n_pos_dims}, m_n_dir_dims{n_dir_dims}, m_dir_offset{dir_offset}, m_n_extra_dims{n_extra_dims} { - m_pos_encoding.reset(tcnn::create_encoding(n_pos_dims, pos_encoding, density_network.contains("otype") && (tcnn::equals_case_insensitive(density_network["otype"], "FullyFusedMLP") || tcnn::equals_case_insensitive(density_network["otype"], "MegakernelMLP")) ? 16u : 8u)); - uint32_t rgb_alignment = tcnn::minimum_alignment(rgb_network); - m_dir_encoding.reset(tcnn::create_encoding(m_n_dir_dims + m_n_extra_dims, dir_encoding, rgb_alignment)); + m_pos_encoding.reset(create_encoding(n_pos_dims, pos_encoding, density_network.contains("otype") && (equals_case_insensitive(density_network["otype"], "FullyFusedMLP") || equals_case_insensitive(density_network["otype"], "MegakernelMLP")) ? 
16u : 8u)); + uint32_t rgb_alignment = minimum_alignment(rgb_network); + m_dir_encoding.reset(create_encoding(m_n_dir_dims + m_n_extra_dims, dir_encoding, rgb_alignment)); json local_density_network_config = density_network; local_density_network_config["n_input_dims"] = m_pos_encoding->padded_output_width(); if (!density_network.contains("n_output_dims")) { local_density_network_config["n_output_dims"] = 16; } - m_density_network.reset(tcnn::create_network(local_density_network_config)); + m_density_network.reset(create_network(local_density_network_config)); - m_rgb_network_input_width = tcnn::next_multiple(m_dir_encoding->padded_output_width() + m_density_network->padded_output_width(), rgb_alignment); + m_rgb_network_input_width = next_multiple(m_dir_encoding->padded_output_width() + m_density_network->padded_output_width(), rgb_alignment); json local_rgb_network_config = rgb_network; local_rgb_network_config["n_input_dims"] = m_rgb_network_input_width; local_rgb_network_config["n_output_dims"] = 3; - m_rgb_network.reset(tcnn::create_network(local_rgb_network_config)); + m_rgb_network.reset(create_network(local_rgb_network_config)); + + m_density_model = std::make_shared>(m_pos_encoding, m_density_network); } virtual ~NerfNetwork() { } - void inference_mixed_precision_impl(cudaStream_t stream, const tcnn::GPUMatrixDynamic& input, tcnn::GPUMatrixDynamic& output, bool use_inference_params = true) override { + void inference_mixed_precision_impl(cudaStream_t stream, const GPUMatrixDynamic& input, GPUMatrixDynamic& output, bool use_inference_params = true) override { uint32_t batch_size = input.n(); - tcnn::GPUMatrixDynamic density_network_input{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; - tcnn::GPUMatrixDynamic rgb_network_input{m_rgb_network_input_width, batch_size, stream, m_dir_encoding->preferred_output_layout()}; + GPUMatrixDynamic density_network_input{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; + GPUMatrixDynamic rgb_network_input{m_rgb_network_input_width, batch_size, stream, m_dir_encoding->preferred_output_layout()}; - tcnn::GPUMatrixDynamic density_network_output = rgb_network_input.slice_rows(0, m_density_network->padded_output_width()); - tcnn::GPUMatrixDynamic rgb_network_output{output.data(), m_rgb_network->padded_output_width(), batch_size, output.layout()}; + GPUMatrixDynamic density_network_output = rgb_network_input.slice_rows(0, m_density_network->padded_output_width()); + GPUMatrixDynamic rgb_network_output{output.data(), m_rgb_network->padded_output_width(), batch_size, output.layout()}; m_pos_encoding->inference_mixed_precision( stream, @@ -127,12 +129,12 @@ class NerfNetwork : public tcnn::Network { m_rgb_network->inference_mixed_precision(stream, rgb_network_input, rgb_network_output, use_inference_params); - tcnn::linear_kernel(extract_density, 0, stream, + linear_kernel(extract_density, 0, stream, batch_size, - density_network_output.layout() == tcnn::AoS ? density_network_output.stride() : 1, - output.layout() == tcnn::AoS ? padded_output_width() : 1, + density_network_output.layout() == AoS ? density_network_output.stride() : 1, + output.layout() == AoS ? padded_output_width() : 1, density_network_output.data(), - output.data() + 3 * (output.layout() == tcnn::AoS ? 1 : batch_size) + output.data() + 3 * (output.layout() == AoS ? 
1 : batch_size) ); } @@ -140,14 +142,14 @@ class NerfNetwork : public tcnn::Network { return m_density_network->padded_output_width(); } - std::unique_ptr forward_impl(cudaStream_t stream, const tcnn::GPUMatrixDynamic& input, tcnn::GPUMatrixDynamic* output = nullptr, bool use_inference_params = false, bool prepare_input_gradients = false) override { + std::unique_ptr forward_impl(cudaStream_t stream, const GPUMatrixDynamic& input, GPUMatrixDynamic* output = nullptr, bool use_inference_params = false, bool prepare_input_gradients = false) override { // Make sure our temporary buffers have the correct size for the given batch size uint32_t batch_size = input.n(); auto forward = std::make_unique(); - forward->density_network_input = tcnn::GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; - forward->rgb_network_input = tcnn::GPUMatrixDynamic{m_rgb_network_input_width, batch_size, stream, m_dir_encoding->preferred_output_layout()}; + forward->density_network_input = GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; + forward->rgb_network_input = GPUMatrixDynamic{m_rgb_network_input_width, batch_size, stream, m_dir_encoding->preferred_output_layout()}; forward->pos_encoding_ctx = m_pos_encoding->forward( stream, @@ -170,14 +172,14 @@ class NerfNetwork : public tcnn::Network { ); if (output) { - forward->rgb_network_output = tcnn::GPUMatrixDynamic{output->data(), m_rgb_network->padded_output_width(), batch_size, output->layout()}; + forward->rgb_network_output = GPUMatrixDynamic{output->data(), m_rgb_network->padded_output_width(), batch_size, output->layout()}; } forward->rgb_network_ctx = m_rgb_network->forward(stream, forward->rgb_network_input, output ? &forward->rgb_network_output : nullptr, use_inference_params, prepare_input_gradients); if (output) { - tcnn::linear_kernel(extract_density, 0, stream, - batch_size, m_dir_encoding->preferred_output_layout() == tcnn::AoS ? forward->density_network_output.stride() : 1, padded_output_width(), forward->density_network_output.data(), output->data()+3 + linear_kernel(extract_density, 0, stream, + batch_size, m_dir_encoding->preferred_output_layout() == AoS ? 
forward->density_network_output.stride() : 1, padded_output_width(), forward->density_network_output.data(), output->data()+3 ); } @@ -186,33 +188,33 @@ class NerfNetwork : public tcnn::Network { void backward_impl( cudaStream_t stream, - const tcnn::Context& ctx, - const tcnn::GPUMatrixDynamic& input, - const tcnn::GPUMatrixDynamic& output, - const tcnn::GPUMatrixDynamic& dL_doutput, - tcnn::GPUMatrixDynamic* dL_dinput = nullptr, + const Context& ctx, + const GPUMatrixDynamic& input, + const GPUMatrixDynamic& output, + const GPUMatrixDynamic& dL_doutput, + GPUMatrixDynamic* dL_dinput = nullptr, bool use_inference_params = false, - tcnn::EGradientMode param_gradients_mode = tcnn::EGradientMode::Overwrite + GradientMode param_gradients_mode = GradientMode::Overwrite ) override { const auto& forward = dynamic_cast(ctx); // Make sure our teporary buffers have the correct size for the given batch size uint32_t batch_size = input.n(); - tcnn::GPUMatrix dL_drgb{m_rgb_network->padded_output_width(), batch_size, stream}; + GPUMatrix dL_drgb{m_rgb_network->padded_output_width(), batch_size, stream}; CUDA_CHECK_THROW(cudaMemsetAsync(dL_drgb.data(), 0, dL_drgb.n_bytes(), stream)); - tcnn::linear_kernel(extract_rgb, 0, stream, + linear_kernel(extract_rgb, 0, stream, batch_size*3, dL_drgb.m(), dL_doutput.m(), dL_doutput.data(), dL_drgb.data() ); - const tcnn::GPUMatrixDynamic rgb_network_output{(T*)output.data(), m_rgb_network->padded_output_width(), batch_size, output.layout()}; - tcnn::GPUMatrixDynamic dL_drgb_network_input{m_rgb_network_input_width, batch_size, stream, m_dir_encoding->preferred_output_layout()}; + const GPUMatrixDynamic rgb_network_output{(T*)output.data(), m_rgb_network->padded_output_width(), batch_size, output.layout()}; + GPUMatrixDynamic dL_drgb_network_input{m_rgb_network_input_width, batch_size, stream, m_dir_encoding->preferred_output_layout()}; m_rgb_network->backward(stream, *forward.rgb_network_ctx, forward.rgb_network_input, rgb_network_output, dL_drgb, &dL_drgb_network_input, use_inference_params, param_gradients_mode); // Backprop through dir encoding if it is trainable or if we need input gradients if (m_dir_encoding->n_params() > 0 || dL_dinput) { - tcnn::GPUMatrixDynamic dL_ddir_encoding_output = dL_drgb_network_input.slice_rows(m_density_network->padded_output_width(), m_dir_encoding->padded_output_width()); - tcnn::GPUMatrixDynamic dL_ddir_encoding_input; + GPUMatrixDynamic dL_ddir_encoding_output = dL_drgb_network_input.slice_rows(m_density_network->padded_output_width(), m_dir_encoding->padded_output_width()); + GPUMatrixDynamic dL_ddir_encoding_input; if (dL_dinput) { dL_ddir_encoding_input = dL_dinput->slice_rows(m_dir_offset, m_dir_encoding->input_width()); } @@ -229,25 +231,25 @@ class NerfNetwork : public tcnn::Network { ); } - tcnn::GPUMatrixDynamic dL_ddensity_network_output = dL_drgb_network_input.slice_rows(0, m_density_network->padded_output_width()); - tcnn::linear_kernel(add_density_gradient, 0, stream, + GPUMatrixDynamic dL_ddensity_network_output = dL_drgb_network_input.slice_rows(0, m_density_network->padded_output_width()); + linear_kernel(add_density_gradient, 0, stream, batch_size, dL_doutput.m(), dL_doutput.data(), - dL_ddensity_network_output.layout() == tcnn::RM ? 1 : dL_ddensity_network_output.stride(), + dL_ddensity_network_output.layout() == RM ? 
1 : dL_ddensity_network_output.stride(), dL_ddensity_network_output.data() ); - tcnn::GPUMatrixDynamic dL_ddensity_network_input; + GPUMatrixDynamic dL_ddensity_network_input; if (m_pos_encoding->n_params() > 0 || dL_dinput) { - dL_ddensity_network_input = tcnn::GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; + dL_ddensity_network_input = GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; } m_density_network->backward(stream, *forward.density_network_ctx, forward.density_network_input, forward.density_network_output, dL_ddensity_network_output, dL_ddensity_network_input.data() ? &dL_ddensity_network_input : nullptr, use_inference_params, param_gradients_mode); // Backprop through pos encoding if it is trainable or if we need input gradients if (dL_ddensity_network_input.data()) { - tcnn::GPUMatrixDynamic dL_dpos_encoding_input; + GPUMatrixDynamic dL_dpos_encoding_input; if (dL_dinput) { dL_dpos_encoding_input = dL_dinput->slice_rows(0, m_pos_encoding->input_width()); } @@ -265,26 +267,19 @@ class NerfNetwork : public tcnn::Network { } } - void density(cudaStream_t stream, const tcnn::GPUMatrixDynamic& input, tcnn::GPUMatrixDynamic& output, bool use_inference_params = true) { - if (input.layout() != tcnn::CM) { + void density(cudaStream_t stream, const GPUMatrixDynamic& input, GPUMatrixDynamic& output, bool use_inference_params = true) { + if (input.layout() != CM) { throw std::runtime_error("NerfNetwork::density input must be in column major format."); } uint32_t batch_size = output.n(); - tcnn::GPUMatrixDynamic density_network_input{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; + GPUMatrixDynamic density_network_input{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; - m_pos_encoding->inference_mixed_precision( - stream, - input.slice_rows(0, m_pos_encoding->input_width()), - density_network_input, - use_inference_params - ); - - m_density_network->inference_mixed_precision(stream, density_network_input, output, use_inference_params); + m_density_model->inference_mixed_precision(stream, input.slice_rows(0, m_pos_encoding->input_width()), output, use_inference_params); } - std::unique_ptr density_forward(cudaStream_t stream, const tcnn::GPUMatrixDynamic& input, tcnn::GPUMatrixDynamic* output = nullptr, bool use_inference_params = false, bool prepare_input_gradients = false) { - if (input.layout() != tcnn::CM) { + std::unique_ptr density_forward(cudaStream_t stream, const GPUMatrixDynamic& input, GPUMatrixDynamic* output = nullptr, bool use_inference_params = false, bool prepare_input_gradients = false) { + if (input.layout() != CM) { throw std::runtime_error("NerfNetwork::density_forward input must be in column major format."); } @@ -293,7 +288,7 @@ class NerfNetwork : public tcnn::Network { auto forward = std::make_unique(); - forward->density_network_input = tcnn::GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; + forward->density_network_input = GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; forward->pos_encoding_ctx = m_pos_encoding->forward( stream, @@ -304,7 +299,7 @@ class NerfNetwork : public tcnn::Network { ); if (output) { - forward->density_network_output = 
tcnn::GPUMatrixDynamic{output->data(), m_density_network->padded_output_width(), batch_size, output->layout()}; + forward->density_network_output = GPUMatrixDynamic{output->data(), m_density_network->padded_output_width(), batch_size, output->layout()}; } forward->density_network_ctx = m_density_network->forward(stream, forward->density_network_input, output ? &forward->density_network_output : nullptr, use_inference_params, prepare_input_gradients); @@ -314,15 +309,15 @@ class NerfNetwork : public tcnn::Network { void density_backward( cudaStream_t stream, - const tcnn::Context& ctx, - const tcnn::GPUMatrixDynamic& input, - const tcnn::GPUMatrixDynamic& output, - const tcnn::GPUMatrixDynamic& dL_doutput, - tcnn::GPUMatrixDynamic* dL_dinput = nullptr, + const Context& ctx, + const GPUMatrixDynamic& input, + const GPUMatrixDynamic& output, + const GPUMatrixDynamic& dL_doutput, + GPUMatrixDynamic* dL_dinput = nullptr, bool use_inference_params = false, - tcnn::EGradientMode param_gradients_mode = tcnn::EGradientMode::Overwrite + GradientMode param_gradients_mode = GradientMode::Overwrite ) { - if (input.layout() != tcnn::CM || (dL_dinput && dL_dinput->layout() != tcnn::CM)) { + if (input.layout() != CM || (dL_dinput && dL_dinput->layout() != CM)) { throw std::runtime_error("NerfNetwork::density_backward input must be in column major format."); } @@ -331,16 +326,16 @@ class NerfNetwork : public tcnn::Network { // Make sure our temporary buffers have the correct size for the given batch size uint32_t batch_size = input.n(); - tcnn::GPUMatrixDynamic dL_ddensity_network_input; + GPUMatrixDynamic dL_ddensity_network_input; if (m_pos_encoding->n_params() > 0 || dL_dinput) { - dL_ddensity_network_input = tcnn::GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; + dL_ddensity_network_input = GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; } m_density_network->backward(stream, *forward.density_network_ctx, forward.density_network_input, output, dL_doutput, dL_ddensity_network_input.data() ? 
&dL_ddensity_network_input : nullptr, use_inference_params, param_gradients_mode); // Backprop through pos encoding if it is trainable or if we need input gradients if (dL_ddensity_network_input.data()) { - tcnn::GPUMatrixDynamic dL_dpos_encoding_input; + GPUMatrixDynamic dL_dpos_encoding_input; if (dL_dinput) { dL_dpos_encoding_input = dL_dinput->slice_rows(0, m_pos_encoding->input_width()); } @@ -359,6 +354,8 @@ class NerfNetwork : public tcnn::Network { } void set_params_impl(T* params, T* inference_params, T* gradients) override { + m_density_model->set_params(params, inference_params, gradients); + size_t offset = 0; m_density_network->set_params(params + offset, inference_params + offset, gradients + offset); offset += m_density_network->n_params(); @@ -373,7 +370,7 @@ class NerfNetwork : public tcnn::Network { offset += m_dir_encoding->n_params(); } - void initialize_params(tcnn::pcg32& rnd, float* params_full_precision, float scale = 1) override { + void initialize_params(pcg32& rnd, float* params_full_precision, float scale = 1) override { m_density_network->initialize_params(rnd, params_full_precision, scale); params_full_precision += m_density_network->n_params(); @@ -434,7 +431,7 @@ class NerfNetwork : public tcnn::Network { return m_density_network->num_forward_activations() + m_rgb_network->num_forward_activations() + 2; } - std::pair forward_activations(const tcnn::Context& ctx, uint32_t layer) const override { + std::pair forward_activations(const Context& ctx, uint32_t layer) const override { const auto& forward = dynamic_cast(ctx); if (layer == 0) { return {forward.density_network_input.data(), m_pos_encoding->preferred_output_layout()}; @@ -447,23 +444,23 @@ class NerfNetwork : public tcnn::Network { } } - const std::shared_ptr>& pos_encoding() const { + const std::shared_ptr>& pos_encoding() const { return m_pos_encoding; } - const std::shared_ptr>& dir_encoding() const { + const std::shared_ptr>& dir_encoding() const { return m_dir_encoding; } - const std::shared_ptr>& density_network() const { + const std::shared_ptr>& density_network() const { return m_density_network; } - const std::shared_ptr>& rgb_network() const { + const std::shared_ptr>& rgb_network() const { return m_rgb_network; } - tcnn::json hyperparams() const override { + json hyperparams() const override { json density_network_hyperparams = m_density_network->hyperparams(); density_network_hyperparams["n_output_dims"] = m_density_network->padded_output_width(); return { @@ -476,10 +473,13 @@ class NerfNetwork : public tcnn::Network { } private: - std::shared_ptr> m_density_network; - std::shared_ptr> m_rgb_network; - std::shared_ptr> m_pos_encoding; - std::shared_ptr> m_dir_encoding; + std::shared_ptr> m_density_network; + std::shared_ptr> m_rgb_network; + std::shared_ptr> m_pos_encoding; + std::shared_ptr> m_dir_encoding; + + // Aggregates m_pos_encoding and m_density_network + std::shared_ptr> m_density_model; uint32_t m_rgb_network_input_width; uint32_t m_n_pos_dims; @@ -488,11 +488,11 @@ class NerfNetwork : public tcnn::Network { uint32_t m_dir_offset; // // Storage of forward pass data - struct ForwardContext : public tcnn::Context { - tcnn::GPUMatrixDynamic density_network_input; - tcnn::GPUMatrixDynamic density_network_output; - tcnn::GPUMatrixDynamic rgb_network_input; - tcnn::GPUMatrix rgb_network_output; + struct ForwardContext : public Context { + GPUMatrixDynamic density_network_input; + GPUMatrixDynamic density_network_output; + GPUMatrixDynamic rgb_network_input; + GPUMatrix 
rgb_network_output; std::unique_ptr pos_encoding_ctx; std::unique_ptr dir_encoding_ctx; @@ -502,4 +502,4 @@ class NerfNetwork : public tcnn::Network { }; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/openxr_hmd.h b/include/neural-graphics-primitives/openxr_hmd.h index ed3f5613e..f7150d794 100644 --- a/include/neural-graphics-primitives/openxr_hmd.h +++ b/include/neural-graphics-primitives/openxr_hmd.h @@ -43,7 +43,7 @@ #pragma GCC diagnostic ignored "-Wmissing-field-initializers" //TODO: XR struct are uninitiaized apart from their type #endif -NGP_NAMESPACE_BEGIN +namespace ngp { enum class EEnvironmentBlendMode { Opaque = XR_ENVIRONMENT_BLEND_MODE_OPAQUE, @@ -284,7 +284,7 @@ class OpenXRHMD { const bool m_print_reference_spaces = false; }; -NGP_NAMESPACE_END +} #ifdef __GNUC__ #pragma GCC diagnostic pop diff --git a/include/neural-graphics-primitives/random_val.cuh b/include/neural-graphics-primitives/random_val.cuh index 08314df64..4399b7a73 100644 --- a/include/neural-graphics-primitives/random_val.cuh +++ b/include/neural-graphics-primitives/random_val.cuh @@ -19,13 +19,13 @@ #include -#include +#include -NGP_NAMESPACE_BEGIN +namespace ngp { -using default_rng_t = tcnn::default_rng_t; +using default_rng_t = pcg32; -inline constexpr float PI() { return 3.14159265358979323846f; } +inline constexpr NGP_HOST_DEVICE float PI() { return 3.14159265358979323846f; } template inline __host__ __device__ float random_val(RNG& rng) { @@ -55,14 +55,14 @@ inline __host__ __device__ vec3 cylindrical_to_dir(const vec2& p) { inline __host__ __device__ vec2 dir_to_cylindrical(const vec3& d) { const float cos_theta = fminf(fmaxf(-d.z, -1.0f), 1.0f); - float phi = std::atan2(d.y, d.x); + float phi = atan2(d.y, d.x); return {(cos_theta + 1.0f) / 2.0f, (phi / (2.0f * PI())) + 0.5f}; } inline __host__ __device__ vec2 dir_to_spherical(const vec3& d) { const float cos_theta = fminf(fmaxf(d.z, -1.0f), 1.0f); const float theta = acosf(cos_theta); - float phi = std::atan2(d.y, d.x); + float phi = atan2(d.y, d.x); return {theta, phi}; } @@ -324,5 +324,5 @@ inline __host__ __device__ vec2 ld_random_pixel_offset(const uint32_t spp) { return offset; } -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/render_buffer.h b/include/neural-graphics-primitives/render_buffer.h index 64c6e9ca3..6bbb4bad3 100644 --- a/include/neural-graphics-primitives/render_buffer.h +++ b/include/neural-graphics-primitives/render_buffer.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include #include @@ -23,7 +23,7 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { typedef unsigned int GLenum; typedef int GLint; @@ -306,9 +306,9 @@ class CudaRenderBuffer { ivec2 m_in_resolution = ivec2(0); - tcnn::GPUMemory m_frame_buffer; - tcnn::GPUMemory m_depth_buffer; - tcnn::GPUMemory m_accumulate_buffer; + GPUMemory m_frame_buffer; + GPUMemory m_depth_buffer; + GPUMemory m_accumulate_buffer; std::shared_ptr> m_hidden_area_mask = nullptr; @@ -316,4 +316,4 @@ class CudaRenderBuffer { std::shared_ptr m_depth_target; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/sdf.h b/include/neural-graphics-primitives/sdf.h index 3b375a982..0d28f10e9 100644 --- a/include/neural-graphics-primitives/sdf.h +++ b/include/neural-graphics-primitives/sdf.h @@ -18,7 +18,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { struct SdfPayload { vec3 dir; @@ -28,7 +28,7 @@ struct SdfPayload { }; struct RaysSdfSoa { -#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) +#if 
defined(__CUDACC__) || (defined(__clang__) && defined(__CUDA__)) void copy_from_other_async(uint32_t n_elements, const RaysSdfSoa& other, cudaStream_t stream) { CUDA_CHECK_THROW(cudaMemcpyAsync(pos, other.pos, n_elements * sizeof(vec3), cudaMemcpyDeviceToDevice, stream)); CUDA_CHECK_THROW(cudaMemcpyAsync(normal, other.normal, n_elements * sizeof(vec3), cudaMemcpyDeviceToDevice, stream)); @@ -67,8 +67,8 @@ struct BRDFParams { float sheen=0.f; float clearcoat=0.f; float clearcoat_gloss=0.f; - vec3 basecolor=vec3(0.8f,0.8f,0.8f); - vec3 ambientcolor=vec3(0.f,0.f,0.f); + vec3 basecolor = {0.8f, 0.8f, 0.8f}; + vec3 ambientcolor = {0.0f, 0.0f, 0.0f}; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/shared_queue.h b/include/neural-graphics-primitives/shared_queue.h index 87629f7e8..61b5f91f2 100644 --- a/include/neural-graphics-primitives/shared_queue.h +++ b/include/neural-graphics-primitives/shared_queue.h @@ -17,7 +17,7 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { class ICallable { public: @@ -117,4 +117,4 @@ class SharedQueue { std::condition_variable mDataCondition; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/takikawa_encoding.cuh b/include/neural-graphics-primitives/takikawa_encoding.cuh index 04bad458c..344150542 100644 --- a/include/neural-graphics-primitives/takikawa_encoding.cuh +++ b/include/neural-graphics-primitives/takikawa_encoding.cuh @@ -23,19 +23,19 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { template __global__ void kernel_takikawa( const uint32_t num_elements, const uint32_t n_levels, const uint32_t starting_level, - const tcnn::InterpolationType interpolation_type, + const InterpolationType interpolation_type, const TriangleOctreeNode* octree_nodes, const TriangleOctreeDualNode* octree_dual_nodes, const T* __restrict__ grid, - const tcnn::MatrixView data_in, - tcnn::MatrixView data_out, + const MatrixView data_in, + MatrixView data_out, float* __restrict__ dy_dx ) { uint32_t n_features = N_FEATURES_PER_LEVEL * n_levels; @@ -61,7 +61,7 @@ __global__ void kernel_takikawa( vec3 pos_derivative; - if (interpolation_type == tcnn::InterpolationType::Linear) { + if (interpolation_type == InterpolationType::Linear) { NGP_PRAGMA_UNROLL for (uint32_t dim = 0; dim < 3; ++dim) { pos_derivative[dim] = 1.0f; @@ -69,14 +69,14 @@ __global__ void kernel_takikawa( } else { NGP_PRAGMA_UNROLL for (uint32_t dim = 0; dim < 3; ++dim) { - pos_derivative[dim] = tcnn::smoothstep_derivative(pos[dim]); - pos[dim] = tcnn::smoothstep(pos[dim]); + pos_derivative[dim] = smoothstep_derivative(pos[dim]); + pos[dim] = smoothstep(pos[dim]); } } if (data_out) { // Tri-linear interpolation - tcnn::vector_t result = {(T)0.0f}; + tvec result = {(T)0.0f}; NGP_PRAGMA_UNROLL for (uint32_t idx = 0; idx < 8; ++idx) { @@ -92,7 +92,7 @@ __global__ void kernel_takikawa( } int param_idx = node.vertices[idx] * N_FEATURES_PER_LEVEL; - result = fma((T)weight, *(tcnn::vector_t*)&grid[param_idx], result); + result = fma((T)weight, *(tvec*)&grid[param_idx], result); } NGP_PRAGMA_UNROLL @@ -107,7 +107,7 @@ __global__ void kernel_takikawa( NGP_PRAGMA_UNROLL for (uint32_t grad_dim = 0; grad_dim < 3; ++grad_dim) { - tcnn::vector_fullp_t grad = {0.0f}; + vec grad = {0.0f}; NGP_PRAGMA_UNROLL for (uint32_t idx = 0; idx < 4; ++idx) { @@ -127,11 +127,11 @@ __global__ void kernel_takikawa( } int param_idx = node.vertices[child_idx] * N_FEATURES_PER_LEVEL; - auto val_left = *(tcnn::vector_t*)&grid[param_idx]; + auto val_left = *(tvec*)&grid[param_idx]; 
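+					// val_left and the val_right fetched below are the two octree
+					// vertex feature vectors straddling the sample along grad_dim;
+					// their weighted difference is the analytic partial derivative
+					// of the tri-linear interpolation along that axis.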
child_idx |= 1 << grad_dim; param_idx = node.vertices[child_idx] * N_FEATURES_PER_LEVEL; - auto val_right = *(tcnn::vector_t*)&grid[param_idx]; + auto val_right = *(tvec*)&grid[param_idx]; NGP_PRAGMA_UNROLL for (uint32_t feature = 0; feature < N_FEATURES_PER_LEVEL; ++feature) { @@ -140,7 +140,7 @@ __global__ void kernel_takikawa( } const uint32_t fan_out_grad = n_features * 3; - *(tcnn::vector_fullp_t*)&dy_dx[i * fan_out_grad + level * N_FEATURES_PER_LEVEL + grad_dim * n_features] = grad; + *(vec*)&dy_dx[i * fan_out_grad + level * N_FEATURES_PER_LEVEL + grad_dim * n_features] = grad; } } } @@ -162,9 +162,9 @@ template __global__ void kernel_takikawa_backward_input( const uint32_t num_elements, const uint32_t num_grid_features, - const tcnn::MatrixView dL_dy, + const MatrixView dL_dy, const float* __restrict__ dy_dx, - tcnn::MatrixView dL_dx + MatrixView dL_dx ) { const uint32_t input_index = threadIdx.x + blockIdx.x * blockDim.x; if (input_index >= num_elements) return; @@ -186,12 +186,12 @@ __global__ void kernel_takikawa_backward( const uint32_t num_elements, const uint32_t n_levels, const uint32_t starting_level, - const tcnn::InterpolationType interpolation_type, + const InterpolationType interpolation_type, const TriangleOctreeNode* octree_nodes, const TriangleOctreeDualNode* octree_dual_nodes, - GRAD_T* __restrict__ params_gradient, - const tcnn::MatrixView data_in, - const tcnn::MatrixView dL_dy + GRAD_T* __restrict__ param_gradients, + const MatrixView data_in, + const MatrixView dL_dy ) { uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; const uint32_t encoded_index = i * N_FEATURES_PER_LEVEL * n_levels; @@ -212,14 +212,14 @@ __global__ void kernel_takikawa_backward( } level -= starting_level; - if (interpolation_type == tcnn::InterpolationType::Smoothstep) { + if (interpolation_type == InterpolationType::Smoothstep) { NGP_PRAGMA_UNROLL for (uint32_t dim = 0; dim < 3; ++dim) { - pos[dim] = tcnn::smoothstep(pos[dim]); + pos[dim] = smoothstep(pos[dim]); } } - tcnn::vector_t grad; + tvec grad; NGP_PRAGMA_UNROLL for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) { @@ -248,7 +248,7 @@ __global__ void kernel_takikawa_backward( NGP_PRAGMA_UNROLL for (uint32_t feature = 0; feature < N_FEATURES_PER_LEVEL; feature += 2) { __half2 v = {(__half)((float)grad[feature] * weight), (__half)((float)grad[feature+1] * weight)}; - atomicAdd((__half2*)¶ms_gradient[param_idx + feature], v); + atomicAdd((__half2*)¶m_gradients[param_idx + feature], v); } } else #endif @@ -259,7 +259,7 @@ __global__ void kernel_takikawa_backward( } else { NGP_PRAGMA_UNROLL for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) { - atomicAdd((float*)¶ms_gradient[param_idx], (float)grad[f] * weight); + atomicAdd((float*)¶m_gradients[param_idx], (float)grad[f] * weight); } } } @@ -269,7 +269,7 @@ __global__ void kernel_takikawa_backward( } template -class TakikawaEncoding : public tcnn::Encoding { +class TakikawaEncoding : public Encoding { public: #if TCNN_MIN_GPU_ARCH >= 60 // The GPUs that we tested this on do not have an efficient 1D fp16 @@ -284,7 +284,7 @@ public: using grad_t = float; #endif - TakikawaEncoding(uint32_t starting_level, std::shared_ptr octree, tcnn::InterpolationType interpolation_type) + TakikawaEncoding(uint32_t starting_level, std::shared_ptr octree, InterpolationType interpolation_type) : m_starting_level{starting_level}, m_octree{octree}, m_interpolation_type{interpolation_type} { if (m_starting_level >= m_octree->depth()) { @@ -300,10 +300,10 @@ public: virtual ~TakikawaEncoding() { } - 
std::unique_ptr forward_impl( + std::unique_ptr forward_impl( cudaStream_t stream, - const tcnn::GPUMatrixDynamic& input, - tcnn::GPUMatrixDynamic* output = nullptr, + const GPUMatrixDynamic& input, + GPUMatrixDynamic* output = nullptr, bool use_inference_params = false, bool prepare_input_gradients = false ) override { @@ -314,10 +314,10 @@ public: } if (prepare_input_gradients) { - forward->dy_dx = tcnn::GPUMatrix{3 * N_FEATURES_PER_LEVEL * n_levels(), input.n(), stream}; + forward->dy_dx = GPUMatrix{3 * N_FEATURES_PER_LEVEL * n_levels(), input.n(), stream}; } - tcnn::linear_kernel(kernel_takikawa, 0, stream, + linear_kernel(kernel_takikawa, 0, stream, input.n(), n_levels(), m_starting_level, @@ -326,7 +326,7 @@ public: m_octree->dual_nodes_gpu(), use_inference_params ? this->inference_params() : this->params(), input.view(), - output ? output->view() : tcnn::MatrixView{}, + output ? output->view() : MatrixView{}, forward->dy_dx.data() ); @@ -335,13 +335,13 @@ public: void backward_impl( cudaStream_t stream, - const tcnn::Context& ctx, - const tcnn::GPUMatrixDynamic& input, - const tcnn::GPUMatrixDynamic& output, - const tcnn::GPUMatrixDynamic& dL_doutput, - tcnn::GPUMatrixDynamic* dL_dinput = nullptr, + const Context& ctx, + const GPUMatrixDynamic& input, + const GPUMatrixDynamic& output, + const GPUMatrixDynamic& dL_doutput, + GPUMatrixDynamic* dL_dinput = nullptr, bool use_inference_params = false, - tcnn::EGradientMode param_gradients_mode = tcnn::EGradientMode::Overwrite + GradientMode param_gradients_mode = GradientMode::Overwrite ) override { const uint32_t num_elements = input.n(); if (padded_output_width() == 0 || num_elements == 0) { @@ -350,37 +350,37 @@ public: const auto& forward = dynamic_cast(ctx); - if (param_gradients_mode != tcnn::EGradientMode::Ignore) { + if (param_gradients_mode != GradientMode::Ignore) { // We accumulate gradients with grad_t precision, which, for performance reasons, is not always T. // If not, accumulate in a temporary buffer and cast later. 
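+			// (E.g. on GPUs without fast fp16 atomics, grad_t is float while T
+			// may be __half: atomic accumulation then happens in a float scratch
+			// buffer and is cast back to T once all atomics have completed.)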
- grad_t* params_gradient; - tcnn::GPUMemoryArena::Allocation params_gradient_tmp; + grad_t* param_gradients; + GPUMemoryArena::Allocation param_gradients_tmp; if (!std::is_same::value) { - params_gradient_tmp = tcnn::allocate_workspace(stream, n_params() * sizeof(grad_t)); - params_gradient = (grad_t*)params_gradient_tmp.data(); + param_gradients_tmp = allocate_workspace(stream, n_params() * sizeof(grad_t)); + param_gradients = (grad_t*)param_gradients_tmp.data(); } else { - params_gradient = (grad_t*)this->gradients(); + param_gradients = (grad_t*)this->gradients(); } - if (param_gradients_mode == tcnn::EGradientMode::Overwrite) { - CUDA_CHECK_THROW(cudaMemsetAsync(params_gradient, 0, n_params() * sizeof(grad_t), stream)); + if (param_gradients_mode == GradientMode::Overwrite) { + CUDA_CHECK_THROW(cudaMemsetAsync(param_gradients, 0, n_params() * sizeof(grad_t), stream)); } - tcnn::linear_kernel(kernel_takikawa_backward, 0, stream, + linear_kernel(kernel_takikawa_backward, 0, stream, num_elements, n_levels(), m_starting_level, m_interpolation_type, m_octree->nodes_gpu(), m_octree->dual_nodes_gpu(), - params_gradient, + param_gradients, input.view(), dL_doutput.view() ); if (!std::is_same::value) { - parallel_for_gpu(stream, n_params(), [grad=this->gradients(), grad_tmp=params_gradient] __device__ (size_t i) { + parallel_for_gpu(stream, n_params(), [grad=this->gradients(), grad_tmp=param_gradients] __device__ (size_t i) { grad[i] = (T)grad_tmp[i]; }); } @@ -388,7 +388,7 @@ public: // Gradient computation w.r.t. input if (dL_dinput) { - tcnn::linear_kernel(kernel_takikawa_backward_input, 0, stream, + linear_kernel(kernel_takikawa_backward_input, 0, stream, num_elements * input_width(), N_FEATURES_PER_LEVEL * n_levels(), dL_doutput.view(), @@ -424,9 +424,9 @@ public: void set_params_impl(T* params, T* inference_params, T* gradients) override { } - void initialize_params(tcnn::pcg32& rnd, float* params_full_precision, float scale = 1) override { + void initialize_params(pcg32& rnd, float* params_full_precision, float scale = 1) override { // Initialize the encoding from the GPU, because the number of parameters can be quite large. 
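+		// The call below draws every parameter uniformly from
+		// [-1e-4 * scale, 1e-4 * scale], so the octree features start out
+		// near zero.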
- tcnn::generate_random_uniform(rnd, n_params(), params_full_precision, -1e-4f * scale, 1e-4f * scale); + generate_random_uniform(rnd, n_params(), params_full_precision, -1e-4f * scale, 1e-4f * scale); } size_t n_params() const override { @@ -437,11 +437,11 @@ public: return m_octree->depth() - m_starting_level; } - tcnn::MatrixLayout preferred_output_layout() const override { - return tcnn::AoS; + MatrixLayout preferred_output_layout() const override { + return AoS; } - tcnn::json hyperparams() const override { + json hyperparams() const override { return { {"otype", "Takikawa"}, {"starting_level", m_starting_level}, @@ -450,8 +450,8 @@ public: } private: - struct ForwardContext : public tcnn::Context { - tcnn::GPUMatrix dy_dx; + struct ForwardContext : public Context { + GPUMatrix dy_dx; }; uint32_t m_starting_level; @@ -462,7 +462,7 @@ private: uint32_t m_n_to_pad = 0; std::shared_ptr m_octree; - tcnn::InterpolationType m_interpolation_type; + InterpolationType m_interpolation_type; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/testbed.h b/include/neural-graphics-primitives/testbed.h index e8eae9726..75aa028dd 100644 --- a/include/neural-graphics-primitives/testbed.h +++ b/include/neural-graphics-primitives/testbed.h @@ -44,16 +44,16 @@ struct GLFWwindow; -TCNN_NAMESPACE_BEGIN +namespace tcnn { template class Loss; template class Optimizer; template class Encoding; template class Network; template class Trainer; template class TrainableBuffer; -TCNN_NAMESPACE_END +} -NGP_NAMESPACE_BEGIN +namespace ngp { template class NerfNetwork; class TriangleOctree; @@ -137,7 +137,7 @@ class Testbed { float m_shadow_sharpness = 2048.f; bool m_trace_shadow_rays = false; - tcnn::GPUMemoryArena::Allocation m_scratch_alloc; + GPUMemoryArena::Allocation m_scratch_alloc; }; class NerfTracer { @@ -177,7 +177,7 @@ class Testbed { ); uint32_t trace( - NerfNetwork& network, + const std::shared_ptr>& network, const BoundingBox& render_aabb, const mat3& render_aabb_to_local, const BoundingBox& train_aabb, @@ -208,12 +208,12 @@ class Testbed { private: RaysNerfSoa m_rays[2]; RaysNerfSoa m_rays_hit; - precision_t* m_network_output; + network_precision_t* m_network_output; float* m_network_input; uint32_t* m_hit_counter; uint32_t* m_alive_counter; uint32_t m_n_rays_initialized = 0; - tcnn::GPUMemoryArena::Allocation m_scratch_alloc; + GPUMemoryArena::Allocation m_scratch_alloc; }; class FiniteDifferenceNormalsApproximator { @@ -234,7 +234,7 @@ class Testbed { float* dist_dy_neg; float* dist_dz_neg; - tcnn::GPUMemoryArena::Allocation m_scratch_alloc; + GPUMemoryArena::Allocation m_scratch_alloc; }; struct LevelStats { @@ -257,7 +257,7 @@ class Testbed { // underflow (round to zero) in the gradient computations. Hence, // scale the loss (and thereby gradients) up by this factor and // divide it out in the optimizer later on. 
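+	// (Illustrative: with a loss scale of 128 and fp16 gradients, a raw value
+	// of 1e-8 would flush to zero, whereas 1e-8 * 128 ~ 1.3e-6 is still
+	// representable; since the optimizer divides the factor back out, the
+	// update is mathematically unchanged.)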
- static constexpr float LOSS_SCALE = 128.0f; + static constexpr float LOSS_SCALE() { return default_loss_scale(); } struct NetworkDims { uint32_t n_input; @@ -278,11 +278,11 @@ class Testbed { class CudaDevice; - const float* get_inference_extra_dims(cudaStream_t stream) const; void render_nerf( cudaStream_t stream, + CudaDevice& device, const CudaRenderBufferView& render_buffer, - NerfNetwork& nerf_network, + const std::shared_ptr>& nerf_network, const uint8_t* density_grid_bitfield, const vec2& focal_length, const mat4x3& camera_matrix0, @@ -396,7 +396,6 @@ class Testbed { void last_training_view(); void previous_training_view(); void next_training_view(); - void add_training_views_to_camera_path(); void set_camera_to_training_view(int trainview); void reset_camera(); bool keyboard_event(); @@ -406,9 +405,9 @@ class Testbed { void mark_density_grid_in_sphere_empty(const vec3& pos, float radius, cudaStream_t stream); struct NerfCounters { - tcnn::GPUMemory numsteps_counter; // number of steps each ray took - tcnn::GPUMemory numsteps_counter_compacted; // number of steps each ray took - tcnn::GPUMemory loss; + GPUMemory numsteps_counter; // number of steps each ray took + GPUMemory numsteps_counter_compacted; // number of steps each ray took + GPUMemory loss; uint32_t rays_per_batch = 1<<12; uint32_t n_rays_total = 0; @@ -438,13 +437,13 @@ class Testbed { vec2 render_screen_center(const vec2& screen_center) const; void optimise_mesh_step(uint32_t N_STEPS); void compute_mesh_vertex_colors(); - tcnn::GPUMemory get_density_on_grid(ivec3 res3d, const BoundingBox& aabb, const mat3& render_aabb_to_local); // network version (nerf or sdf) - tcnn::GPUMemory get_sdf_gt_on_grid(ivec3 res3d, const BoundingBox& aabb, const mat3& render_aabb_to_local); // sdf gt version (sdf only) - tcnn::GPUMemory get_rgba_on_grid(ivec3 res3d, vec3 ray_dir, bool voxel_centers, float depth, bool density_as_alpha = false); + GPUMemory get_density_on_grid(ivec3 res3d, const BoundingBox& aabb, const mat3& render_aabb_to_local); // network version (nerf or sdf) + GPUMemory get_sdf_gt_on_grid(ivec3 res3d, const BoundingBox& aabb, const mat3& render_aabb_to_local); // sdf gt version (sdf only) + GPUMemory get_rgba_on_grid(ivec3 res3d, vec3 ray_dir, bool voxel_centers, float depth, bool density_as_alpha = false); int marching_cubes(ivec3 res3d, const BoundingBox& render_aabb, const mat3& render_aabb_to_local, float thresh); float get_depth_from_renderbuffer(const CudaRenderBuffer& render_buffer, const vec2& uv); - vec3 get_3d_pos_from_pixel(const CudaRenderBuffer& render_buffer, const ivec2& focus_pixel); + vec3 get_3d_pos_from_pixel(const CudaRenderBuffer& render_buffer, const vec2& focus_pixel); void autofocus(); size_t n_params(); size_t first_encoder_param(); @@ -467,7 +466,6 @@ class Testbed { void init_vr(); void update_vr_performance_settings(); void apply_camera_smoothing(float elapsed_ms); - int find_best_training_view(int default_view); bool begin_frame(); void handle_user_input(); vec3 vr_to_world(const vec3& pos) const; @@ -501,6 +499,7 @@ class Testbed { ivec3 compute_and_save_png_slices(const fs::path& filename, int res, BoundingBox aabb = {}, float thresh = 2.5f, float density_range = 4.f, bool flip_y_and_z_axes = false); fs::path root_dir(); + void set_root_dir(const fs::path& dir); //////////////////////////////////////////////////////////////// // marching cubes related state @@ -512,14 +511,14 @@ class Testbed { float density_amount = 128.f; float inflate_amount = 1.f; bool optimize_mesh = false; - 
tcnn::GPUMemory verts;
- tcnn::GPUMemory vert_normals;
- tcnn::GPUMemory vert_colors;
- tcnn::GPUMemory verts_smoothed; // homogenous
- tcnn::GPUMemory indices;
- tcnn::GPUMemory verts_gradient;
+ GPUMemory verts;
+ GPUMemory vert_normals;
+ GPUMemory vert_colors;
+ GPUMemory verts_smoothed; // homogeneous
+ GPUMemory indices;
+ GPUMemory verts_gradient;
 std::shared_ptr> trainable_verts;
- std::shared_ptr> verts_optimizer;
+ std::shared_ptr> verts_optimizer;
 void clear() {
 indices={};
@@ -566,8 +565,8 @@ class Testbed {
 float m_ndc_znear = 1.0f / 32.0f;
 float m_ndc_zfar = 128.0f;
- mat4x3 m_camera = mat4x3(1.0f);
- mat4x3 m_smoothed_camera = mat4x3(1.0f);
+ mat4x3 m_camera = mat4x3::identity();
+ mat4x3 m_smoothed_camera = mat4x3::identity();
 size_t m_render_skip_due_to_lack_of_camera_movement_counter = 0;
 bool m_fps_camera = false;
@@ -639,10 +638,10 @@ class Testbed {
 int n_images_for_training_prev = 0; // how many images we saw last time we updated the density grid
 struct ErrorMap {
- tcnn::GPUMemory data;
- tcnn::GPUMemory cdf_x_cond_y;
- tcnn::GPUMemory cdf_y;
- tcnn::GPUMemory cdf_img;
+ GPUMemory data;
+ GPUMemory cdf_x_cond_y;
+ GPUMemory cdf_y;
+ GPUMemory cdf_img;
 std::vector pmf_img_cpu;
 ivec2 resolution = {16, 16};
 ivec2 cdf_resolution = {16, 16};
@@ -650,31 +649,31 @@
 } error_map;
 std::vector transforms;
- tcnn::GPUMemory transforms_gpu;
+ GPUMemory transforms_gpu;
 std::vector cam_pos_gradient;
- tcnn::GPUMemory cam_pos_gradient_gpu;
+ GPUMemory cam_pos_gradient_gpu;
 std::vector cam_rot_gradient;
- tcnn::GPUMemory cam_rot_gradient_gpu;
+ GPUMemory cam_rot_gradient_gpu;
- tcnn::GPUMemory cam_exposure_gpu;
+ GPUMemory cam_exposure_gpu;
 std::vector cam_exposure_gradient;
- tcnn::GPUMemory cam_exposure_gradient_gpu;
+ GPUMemory cam_exposure_gradient_gpu;
 vec2 cam_focal_length_gradient = vec2(0.0f);
- tcnn::GPUMemory cam_focal_length_gradient_gpu;
+ GPUMemory cam_focal_length_gradient_gpu;
 std::vector> cam_exposure;
 std::vector> cam_pos_offset;
 std::vector cam_rot_offset;
 AdamOptimizer cam_focal_length_offset = AdamOptimizer(0.0f);
- tcnn::GPUMemory extra_dims_gpu; // if the model demands a latent code per training image, we put them in here.
- tcnn::GPUMemory extra_dims_gradient_gpu;
+ GPUMemory extra_dims_gpu; // if the model demands a latent code per training image, we put them in here.
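+ // (Layout sketch, assuming the per-image latent codes are packed contiguously:
+ // image i's n_extra_learnable_dims floats would then start at float offset
+ // i * n_extra_learnable_dims within this buffer.)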
+ GPUMemory extra_dims_gradient_gpu; std::vector extra_dims_opt; - void reset_extra_dims(default_rng_t &rng); + std::vector get_extra_dims_cpu(int trainview) const; float extrinsic_l2_reg = 1e-4f; float extrinsic_learning_rate = 1e-3f; @@ -715,7 +714,7 @@ class Testbed { float depth_supervision_lambda = 0.f; - tcnn::GPUMemory sharpness_grid; + GPUMemory sharpness_grid; void set_camera_intrinsics(int frame_idx, float fx, float fy = 0.0f, float cx = -0.5f, float cy = -0.5f, float k1 = 0.0f, float k2 = 0.0f, float p1 = 0.0f, float p2 = 0.0f, float k3 = 0.0f, float k4 = 0.0f, bool is_fisheye = false); void set_camera_extrinsics_rolling_shutter(int frame_idx, mat4x3 camera_to_world_start, mat4x3 camera_to_world_end, const vec4& rolling_shutter, bool convert_to_ngp = true); @@ -732,10 +731,10 @@ class Testbed { void export_camera_extrinsics(const fs::path& path, bool export_extrinsics_in_quat_format = true); } training = {}; - tcnn::GPUMemory density_grid; // NERF_GRIDSIZE()^3 grid of EMA smoothed densities from the network - tcnn::GPUMemory density_grid_bitfield; + GPUMemory density_grid; // NERF_GRIDSIZE()^3 grid of EMA smoothed densities from the network + GPUMemory density_grid_bitfield; uint8_t* get_density_grid_bitfield_mip(uint32_t mip); - tcnn::GPUMemory density_grid_mean; + GPUMemory density_grid_mean; uint32_t density_grid_ema_step = 0; uint32_t max_cascade = 0; @@ -744,7 +743,12 @@ class Testbed { ENerfActivation density_activation = ENerfActivation::Exponential; vec3 light_dir = vec3(0.5f); - uint32_t extra_dim_idx_for_inference = 0; // which training image's latent code should be presented at inference time + // which training image's latent code should be used for rendering + int rendering_extra_dims_from_training_view = 0; + GPUMemory rendering_extra_dims; + + void reset_extra_dims(default_rng_t &rng); + const float* get_rendering_extra_dims(cudaStream_t stream) const; int show_accel = -1; @@ -757,10 +761,15 @@ class Testbed { Lens render_lens = {}; float render_min_transmittance = 0.01f; + bool render_gbuffer_hard_edges = false; float glow_y_cutoff = 0.f; int glow_mode = 0; + int find_closest_training_view(mat4x3 pose) const; + void set_rendering_extra_dims_from_training_view(int trainview); + void set_rendering_extra_dims(const std::vector& vals); + std::vector get_rendering_extra_dims_cpu() const; } m_nerf; struct Sdf { @@ -776,11 +785,11 @@ class Testbed { EMeshSdfMode mesh_sdf_mode = EMeshSdfMode::Raystab; float mesh_scale; - tcnn::GPUMemory triangles_gpu; + GPUMemory triangles_gpu; std::vector triangles_cpu; std::vector triangle_weights; DiscreteDistribution triangle_distribution; - tcnn::GPUMemory triangle_cdf; + GPUMemory triangle_cdf; std::shared_ptr triangle_bvh; // unique_ptr bool uses_takikawa_encoding = false; @@ -788,7 +797,7 @@ class Testbed { int octree_depth_target = 0; // we duplicate this state so that you can waggle the slider without triggering it immediately std::shared_ptr triangle_octree; - tcnn::GPUMemory brick_data; + GPUMemory brick_data; uint32_t brick_res = 0; uint32_t brick_level = 10; uint32_t brick_quantise_bits = 0; @@ -801,7 +810,7 @@ class Testbed { double iou = 0.0; float iou_decay = 0.0f; bool calculate_iou_online = false; - tcnn::GPUMemory iou_counter; + GPUMemory iou_counter; struct Training { size_t idx = 0; size_t size = 0; @@ -809,11 +818,11 @@ class Testbed { bool did_generate_more_training_data = false; bool generate_sdf_data_online = true; float surface_offset_scale = 1.0f; - tcnn::GPUMemory positions; - tcnn::GPUMemory 
positions_shuffled; - tcnn::GPUMemory distances; - tcnn::GPUMemory distances_shuffled; - tcnn::GPUMemory perturbations; + GPUMemory positions; + GPUMemory positions_shuffled; + GPUMemory distances; + GPUMemory distances_shuffled; + GPUMemory perturbations; } training = {}; } m_sdf; @@ -823,18 +832,18 @@ class Testbed { }; struct Image { - tcnn::GPUMemory data; + GPUMemory data; EDataType type = EDataType::Float; ivec2 resolution = ivec2(0); - tcnn::GPUMemory render_coords; - tcnn::GPUMemory render_out; + GPUMemory render_coords; + GPUMemory render_out; struct Training { - tcnn::GPUMemory positions_tmp; - tcnn::GPUMemory positions; - tcnn::GPUMemory targets; + GPUMemory positions_tmp; + GPUMemory positions; + GPUMemory targets; bool snap_to_pixel_centers = true; bool linear_colors = false; @@ -853,22 +862,22 @@ class Testbed { float albedo = 0.95f; float scattering = 0.f; float inv_distance_scale = 100.f; - tcnn::GPUMemory nanovdb_grid; - tcnn::GPUMemory bitgrid; + GPUMemory nanovdb_grid; + GPUMemory bitgrid; float global_majorant = 1.f; - vec3 world2index_offset = {0, 0, 0}; + vec3 world2index_offset = {0.0f, 0.0f, 0.0f}; float world2index_scale = 1.f; struct Training { - tcnn::GPUMemory positions = {}; - tcnn::GPUMemory targets = {}; + GPUMemory positions = {}; + GPUMemory targets = {}; } training = {}; // tracing state - tcnn::GPUMemory pos[2] = {}; - tcnn::GPUMemory payload[2] = {}; - tcnn::GPUMemory hit_counter = {}; - tcnn::GPUMemory radiance_and_density; + GPUMemory pos[2] = {}; + GPUMemory payload[2] = {}; + GPUMemory hit_counter = {}; + GPUMemory radiance_and_density; } m_volume; float m_camera_velocity = 1.0f; @@ -886,7 +895,7 @@ class Testbed { BoundingBox m_raw_aabb; BoundingBox m_aabb; BoundingBox m_render_aabb; - mat3 m_render_aabb_to_local = mat3(1.0f); + mat3 m_render_aabb_to_local = mat3::identity(); mat4x3 crop_box(bool nerf_space) const; std::vector crop_box_corners(bool nerf_space) const; @@ -915,9 +924,9 @@ class Testbed { ivec2 full_resolution = {1, 1}; int visualized_dimension = 0; - mat4x3 camera0 = mat4x3(1.0f); - mat4x3 camera1 = mat4x3(1.0f); - mat4x3 prev_camera = mat4x3(1.0f); + mat4x3 camera0 = mat4x3::identity(); + mat4x3 camera1 = mat4x3::identity(); + mat4x3 prev_camera = mat4x3::identity(); Foveation foveation; Foveation prev_foveation; @@ -957,15 +966,15 @@ class Testbed { vec3 m_parallax_shift = {0.0f, 0.0f, 0.0f}; // to shift the viewer's origin by some amount in camera space // CUDA stuff - tcnn::StreamAndEvent m_stream; + StreamAndEvent m_stream; // Hashgrid encoding analysis float m_quant_percent = 0.f; std::vector m_level_stats; std::vector m_first_layer_column_stats; - int m_n_levels = 0; + uint32_t m_n_levels = 0; uint32_t m_n_features_per_level = 0; - int m_histo_level = 0; // collect a histogram for this level + uint32_t m_histo_level = 0; // collect a histogram for this level uint32_t m_base_grid_resolution; float m_per_level_scale; float m_histo[257] = {}; @@ -983,19 +992,14 @@ class Testbed { class CudaDevice { public: struct Data { - tcnn::GPUMemory density_grid_bitfield; + GPUMemory density_grid_bitfield; uint8_t* density_grid_bitfield_ptr; - tcnn::GPUMemory params; + GPUMemory params; std::shared_ptr> hidden_area_mask; }; - CudaDevice(int id, bool is_primary) : m_id{id}, m_is_primary{is_primary} { - auto guard = device_guard(); - m_stream = std::make_unique(); - m_data = std::make_unique(); - m_render_worker = std::make_unique(is_primary ? 
0u : 1u); - } + CudaDevice(int id, bool is_primary); CudaDevice(const CudaDevice&) = delete; CudaDevice& operator=(const CudaDevice&) = delete; @@ -1003,17 +1007,7 @@ class Testbed { CudaDevice(CudaDevice&&) = default; CudaDevice& operator=(CudaDevice&&) = default; - tcnn::ScopeGuard device_guard() { - int prev_device = tcnn::cuda_device(); - if (prev_device == m_id) { - return {}; - } - - tcnn::set_cuda_device(m_id); - return tcnn::ScopeGuard{[prev_device]() { - tcnn::set_cuda_device(prev_device); - }}; - } + ScopeGuard device_guard(); int id() const { return m_id; @@ -1024,11 +1018,11 @@ class Testbed { } std::string name() const { - return tcnn::cuda_device_name(m_id); + return cuda_device_name(m_id); } int compute_capability() const { - return tcnn::cuda_compute_capability(m_id); + return cuda_compute_capability(m_id); } cudaStream_t stream() const { @@ -1064,17 +1058,14 @@ class Testbed { m_dirty = value; } - void set_network(const std::shared_ptr>& network) { - m_network = network; - } - - void set_nerf_network(const std::shared_ptr>& nerf_network); + void set_network(const std::shared_ptr>& network); + void set_nerf_network(const std::shared_ptr>& nerf_network); - const std::shared_ptr>& network() const { + const std::shared_ptr>& network() const { return m_network; } - const std::shared_ptr>& nerf_network() const { + const std::shared_ptr>& nerf_network() const { return m_nerf_network; } @@ -1087,7 +1078,7 @@ class Testbed { } template - auto enqueue_task(F&& f) -> std::future> { + auto enqueue_task(F&& f) -> std::future> { if (is_primary()) { return std::async(std::launch::deferred, std::forward(f)); } else { @@ -1098,7 +1089,7 @@ class Testbed { private: int m_id; bool m_is_primary; - std::unique_ptr m_stream; + std::unique_ptr m_stream; struct Event { Event() { CUDA_CHECK_THROW(cudaEventCreate(&event)); @@ -1122,8 +1113,8 @@ class Testbed { std::unique_ptr m_data; CudaRenderBufferView m_render_buffer_view = {}; - std::shared_ptr> m_network; - std::shared_ptr> m_nerf_network; + std::shared_ptr> m_network; + std::shared_ptr> m_nerf_network; bool m_dirty = true; @@ -1131,7 +1122,7 @@ class Testbed { }; void sync_device(CudaRenderBuffer& render_buffer, CudaDevice& device); - tcnn::ScopeGuard use_device(cudaStream_t stream, CudaRenderBuffer& render_buffer, CudaDevice& device); + ScopeGuard use_device(cudaStream_t stream, CudaRenderBuffer& render_buffer, CudaDevice& device); void set_all_devices_dirty(); std::vector m_devices; @@ -1155,7 +1146,6 @@ class Testbed { nlohmann::json m_network_config; - default_rng_t m_rng; CudaRenderBuffer m_windowless_render_surface{std::make_shared()}; @@ -1164,16 +1154,16 @@ class Testbed { uint32_t network_num_forward_activations() const; // Network & training stuff - std::shared_ptr> m_loss; - std::shared_ptr> m_optimizer; - std::shared_ptr> m_encoding; - std::shared_ptr> m_network; - std::shared_ptr> m_trainer; + std::shared_ptr> m_loss; + std::shared_ptr> m_optimizer; + std::shared_ptr> m_encoding; + std::shared_ptr> m_network; + std::shared_ptr> m_trainer; struct TrainableEnvmap { - std::shared_ptr> optimizer; + std::shared_ptr> optimizer; std::shared_ptr> envmap; - std::shared_ptr> trainer; + std::shared_ptr> trainer; ivec2 resolution; ELossType loss_type; @@ -1196,9 +1186,9 @@ class Testbed { } m_envmap; struct TrainableDistortionMap { - std::shared_ptr> optimizer; + std::shared_ptr> optimizer; std::shared_ptr> map; - std::shared_ptr> trainer; + std::shared_ptr> trainer; ivec2 resolution; Buffer2DView inference_view() const { @@ -1217,7 +1207,8 
@@ class Testbed { return {(const vec2*)map->params(), resolution}; } } m_distortion; - std::shared_ptr> m_nerf_network; + + std::shared_ptr> m_nerf_network; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/thread_pool.h b/include/neural-graphics-primitives/thread_pool.h index 879888306..099b72177 100644 --- a/include/neural-graphics-primitives/thread_pool.h +++ b/include/neural-graphics-primitives/thread_pool.h @@ -22,7 +22,7 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { template void wait_all(T&& futures) { @@ -106,4 +106,4 @@ class ThreadPool { std::condition_variable m_task_queue_completed_condition; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/tinyexr_wrapper.h b/include/neural-graphics-primitives/tinyexr_wrapper.h index db690368b..8a82ba91b 100644 --- a/include/neural-graphics-primitives/tinyexr_wrapper.h +++ b/include/neural-graphics-primitives/tinyexr_wrapper.h @@ -16,12 +16,12 @@ #pragma once -#include +#include -NGP_NAMESPACE_BEGIN +namespace ngp { void save_exr(const float* data, int width, int height, int nChannels, int channelStride, const fs::path& path); void load_exr(float** data, int* width, int* height, const fs::path& path); __half* load_exr_to_gpu(int* width, int* height, const fs::path& path, bool fix_premult); -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/tinyobj_loader_wrapper.h b/include/neural-graphics-primitives/tinyobj_loader_wrapper.h index b94859bf4..260bebea2 100644 --- a/include/neural-graphics-primitives/tinyobj_loader_wrapper.h +++ b/include/neural-graphics-primitives/tinyobj_loader_wrapper.h @@ -21,8 +21,8 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { std::vector load_obj(const fs::path& path); -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/trainable_buffer.cuh b/include/neural-graphics-primitives/trainable_buffer.cuh index dcf24e0c8..8fb03bbbf 100644 --- a/include/neural-graphics-primitives/trainable_buffer.cuh +++ b/include/neural-graphics-primitives/trainable_buffer.cuh @@ -24,45 +24,45 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { template -class TrainableBuffer : public tcnn::DifferentiableObject { +class TrainableBuffer : public DifferentiableObject { public: template TrainableBuffer(const RES& resolution) { for (uint32_t i = 0; i < RANK; ++i) { m_resolution[i] = resolution[i]; } - m_params_gradient_weight.resize(n_params()); + m_param_gradients_weight.resize(n_params()); } virtual ~TrainableBuffer() { } - void inference_mixed_precision_impl(cudaStream_t stream, const tcnn::GPUMatrixDynamic& input, tcnn::GPUMatrixDynamic& output, bool use_inference_matrices = true) override { + void inference_mixed_precision_impl(cudaStream_t stream, const GPUMatrixDynamic& input, GPUMatrixDynamic& output, bool use_inference_matrices = true) override { throw std::runtime_error{"The trainable buffer does not support inference(). Its content is meant to be used externally."}; } - std::unique_ptr forward_impl(cudaStream_t stream, const tcnn::GPUMatrixDynamic& input, tcnn::GPUMatrixDynamic* output = nullptr, bool use_inference_matrices = false, bool prepare_input_gradients = false) override { + std::unique_ptr forward_impl(cudaStream_t stream, const GPUMatrixDynamic& input, GPUMatrixDynamic* output = nullptr, bool use_inference_matrices = false, bool prepare_input_gradients = false) override { throw std::runtime_error{"The trainable buffer does not support forward(). 
Its content is meant to be used externally."}; } void backward_impl( cudaStream_t stream, - const tcnn::Context& ctx, - const tcnn::GPUMatrixDynamic& input, - const tcnn::GPUMatrixDynamic& output, - const tcnn::GPUMatrixDynamic& dL_doutput, - tcnn::GPUMatrixDynamic* dL_dinput = nullptr, + const Context& ctx, + const GPUMatrixDynamic& input, + const GPUMatrixDynamic& output, + const GPUMatrixDynamic& dL_doutput, + GPUMatrixDynamic* dL_dinput = nullptr, bool use_inference_matrices = false, - tcnn::EGradientMode param_gradients_mode = tcnn::EGradientMode::Overwrite + GradientMode param_gradients_mode = GradientMode::Overwrite ) override { throw std::runtime_error{"The trainable buffer does not support backward(). Its content is meant to be used externally."}; } void set_params_impl(T* params, T* inference_params, T* gradients) override { } - void initialize_params(tcnn::pcg32& rnd, float* params_full_precision, float scale = 1) override { + void initialize_params(pcg32& rnd, float* params_full_precision, float scale = 1) override { // Initialize the buffer to zero from the GPU CUDA_CHECK_THROW(cudaMemset(params_full_precision, 0, n_params()*sizeof(float))); } @@ -96,10 +96,10 @@ public: } T* gradient_weights() const { - return m_params_gradient_weight.data(); + return m_param_gradients_weight.data(); } - tcnn::json hyperparams() const override { + json hyperparams() const override { return { {"otype", "TrainableBuffer"}, }; @@ -107,7 +107,7 @@ public: private: uint32_t m_resolution[RANK]; - tcnn::GPUMemory m_params_gradient_weight; + GPUMemory m_param_gradients_weight; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/triangle.cuh b/include/neural-graphics-primitives/triangle.cuh index 75ed25793..54fa30690 100644 --- a/include/neural-graphics-primitives/triangle.cuh +++ b/include/neural-graphics-primitives/triangle.cuh @@ -20,11 +20,11 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { struct Triangle { NGP_HOST_DEVICE vec3 sample_uniform_position(const vec2& sample) const { - float sqrt_x = std::sqrt(sample.x); + float sqrt_x = sqrt(sample.x); float factor0 = 1.0f - sqrt_x; float factor1 = sqrt_x * (1.0f - sample.y); float factor2 = sqrt_x * sample.y; @@ -52,7 +52,7 @@ struct Triangle { float v = d * dot(q, v1v0); float t = d * -dot(n, rov0); if (u < 0.0f || u > 1.0f || v < 0.0f || (u+v) > 1.0f || t < 0.0f) { - t = std::numeric_limits::max(); // No intersection + t = std::numeric_limits::max(); } return t; } @@ -74,10 +74,10 @@ struct Triangle { (sign(dot(cross(v21, nor), p1)) + sign(dot(cross(v32, nor), p2)) + sign(dot(cross(v13, nor), p3)) < 2.0f) ? 
// 3 edges
- std::min({
- length2(v21 * tcnn::clamp(dot(v21, p1) / length2(v21), 0.0f, 1.0f)-p1),
- length2(v32 * tcnn::clamp(dot(v32, p2) / length2(v32), 0.0f, 1.0f)-p2),
- length2(v13 * tcnn::clamp(dot(v13, p3) / length2(v13), 0.0f, 1.0f)-p3),
+ min(vec3{
+ length2(v21 * clamp(dot(v21, p1) / length2(v21), 0.0f, 1.0f)-p1),
+ length2(v32 * clamp(dot(v32, p2) / length2(v32), 0.0f, 1.0f)-p2),
+ length2(v13 * clamp(dot(v13, p3) / length2(v13), 0.0f, 1.0f)-p3),
 }) :
 // 1 face
@@ -85,7 +85,7 @@
 }
 NGP_HOST_DEVICE float distance(const vec3& pos) const {
- return std::sqrt(distance_sq(pos));
+ return sqrt(distance_sq(pos));
 }
 NGP_HOST_DEVICE bool point_in_triangle(const vec3& p) const {
@@ -116,7 +116,7 @@
 NGP_HOST_DEVICE vec3 closest_point_to_line(const vec3& a, const vec3& b, const vec3& c) const {
 float t = dot(c - a, b - a) / dot(b - a, b - a);
- t = std::max(std::min(t, 1.0f), 0.0f);
+ t = max(min(t, 1.0f), 0.0f);
 return a + t * (b - a);
 }
@@ -135,7 +135,7 @@
 float mag2 = length2(point - c2);
 float mag3 = length2(point - c3);
- float min = std::min({mag1, mag2, mag3});
+ float min = tcnn::min(vec3{mag1, mag2, mag3});
 if (min == mag1) {
 return c1;
@@ -163,13 +163,4 @@
 vec3 a, b, c;
};
-inline std::ostream& operator<<(std::ostream& os, const ngp::Triangle& triangle) {
- os << "[";
- os << "a=[" << triangle.a.x << "," << triangle.a.y << "," << triangle.a.z << "], ";
- os << "b=[" << triangle.b.x << "," << triangle.b.y << "," << triangle.b.z << "], ";
- os << "c=[" << triangle.c.x << "," << triangle.c.y << "," << triangle.c.z << "]";
- os << "]";
- return os;
-}
-
-NGP_NAMESPACE_END
+}
diff --git a/include/neural-graphics-primitives/triangle_bvh.cuh b/include/neural-graphics-primitives/triangle_bvh.cuh
index c859faa4c..814db7bc4 100644
--- a/include/neural-graphics-primitives/triangle_bvh.cuh
+++ b/include/neural-graphics-primitives/triangle_bvh.cuh
@@ -23,7 +23,7 @@
 #include
-NGP_NAMESPACE_BEGIN
+namespace ngp {
 struct TriangleBvhNode {
 BoundingBox bb;
@@ -65,7 +65,7 @@ public:
 virtual void ray_trace_gpu(uint32_t n_elements, vec3* gpu_positions, vec3* gpu_directions, const Triangle* gpu_triangles, cudaStream_t stream) = 0;
 virtual bool touches_triangle(const BoundingBox& bb, const Triangle* __restrict__ triangles) const = 0;
 virtual void build(std::vector& triangles, uint32_t n_primitives_per_leaf) = 0;
- virtual void build_optix(const tcnn::GPUMemory& triangles, cudaStream_t stream) = 0;
+ virtual void build_optix(const GPUMemory& triangles, cudaStream_t stream) = 0;
 static std::unique_ptr make();
@@ -75,8 +75,8 @@ public:
 protected:
 std::vector m_nodes;
- tcnn::GPUMemory m_nodes_gpu;
+ GPUMemory m_nodes_gpu;
 TriangleBvh() {};
};
-NGP_NAMESPACE_END
+}
diff --git a/include/neural-graphics-primitives/triangle_octree.cuh b/include/neural-graphics-primitives/triangle_octree.cuh
index 7180a44da..0fddc4039 100644
--- a/include/neural-graphics-primitives/triangle_octree.cuh
+++ b/include/neural-graphics-primitives/triangle_octree.cuh
@@ -23,8 +23,8 @@ namespace std {
 template<>
- struct less {
- bool operator()(const u16vec4& a, const u16vec4& b) const {
+ struct less {
+ bool operator()(const tcnn::u16vec4& a, const tcnn::u16vec4& b) const {
 for(size_t i = 0; i < 4; ++i) {
 if (a[i] < b[i]) return true;
 if (a[i] > b[i]) return false;
@@ -34,14 +34,14 @@ namespace std {
 };
 template <>
- struct hash {
- size_t operator()(const u16vec4& x) const {
+ struct hash {
+ size_t operator()(const tcnn::u16vec4& x) const {
 return (size_t)x.x *
73856093 + (size_t)x.y * 19349663 + (size_t)x.z * 83492791 + (size_t)x.w * 25165843;
 }
 };
}
-NGP_NAMESPACE_BEGIN
+namespace ngp {
 struct TriangleOctreeNode {
 int children[8];
@@ -180,7 +180,7 @@ public:
 }
 };
- generate_dual_coords(m_dual_nodes[0], 0, {0, 0, 0});
+ generate_dual_coords(m_dual_nodes[0], 0, {(uint16_t)0, (uint16_t)0, (uint16_t)0});
 for (auto& node : m_nodes) {
 for (uint32_t i = 0; i < 8; ++i) {
 auto child_idx = node.children[i];
@@ -349,11 +349,11 @@ private:
 std::vector m_nodes;
 std::vector m_dual_nodes;
- tcnn::GPUMemory m_nodes_gpu;
- tcnn::GPUMemory m_dual_nodes_gpu;
+ GPUMemory m_nodes_gpu;
+ GPUMemory m_dual_nodes_gpu;
 uint32_t m_n_vertices = 0;
 uint32_t m_depth = 0;
};
-NGP_NAMESPACE_END
+}
diff --git a/scripts/run.py b/scripts/run.py
index f60e77e28..263626299 100644
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -204,6 +204,7 @@ def get_scene(scene):
 tqdm_last_update = now
 if args.save_snapshot:
+ os.makedirs(os.path.dirname(args.save_snapshot) or ".", exist_ok=True) # dirname is empty when saving to a bare filename
 testbed.save_snapshot(args.save_snapshot, False)
 if args.test_transforms:
@@ -280,7 +281,7 @@ def get_scene(scene):
 print(args.screenshot_frames)
 for idx in args.screenshot_frames:
 f = ref_transforms["frames"][int(idx)]
- cam_matrix = f["transform_matrix"]
+ cam_matrix = f.get("transform_matrix", f["transform_matrix_start"])
 testbed.set_nerf_camera_matrix(np.matrix(cam_matrix)[:-1,:])
 outname = os.path.join(args.screenshot_dir, os.path.basename(f["file_path"]))
diff --git a/src/camera_path.cu b/src/camera_path.cu
index 14ea30001..02e33a57d 100644
--- a/src/camera_path.cu
+++ b/src/camera_path.cu
@@ -26,7 +26,7 @@
 using namespace nlohmann;
-NGP_NAMESPACE_BEGIN
+namespace ngp {
 CameraKeyframe lerp(const CameraKeyframe& p0, const CameraKeyframe& p1, float t, float t0, float t1) {
 t = (t - t0) / (t1 - t0);
@@ -157,7 +157,7 @@ int CameraPath::imgui(char path_filename_buf[1024], float frame_milliseconds, ma
 if (ImGui::Button("Load")) {
 try {
 load(path_filename_buf, first_xform);
- } catch (std::exception& e) {
+ } catch (const std::exception& e) {
 ImGui::OpenPopup("Camera path load error");
 camera_path_load_error_string = std::string{"Failed to load camera path: "} + e.what();
 }
@@ -259,7 +259,7 @@ int CameraPath::imgui(char path_filename_buf[1024], float frame_milliseconds, ma
 }
 bool debug_project(const mat4& proj, vec3 p, ImVec2& o) {
- vec4 ph(p, 1.0f);
+ vec4 ph{p.x, p.y, p.z, 1.0f};
 vec4 pa = proj * ph;
 if (pa.w <= 0.f) {
 return false;
 }
@@ -323,12 +323,12 @@ bool CameraPath::imgui_viz(ImDrawList* list, mat4 &view2proj, mat4 &world2proj,
 bool changed = false;
 // float flx = focal.x;
 float fly = focal.y;
- mat4 view2proj_guizmo = transpose(mat4(
+ mat4 view2proj_guizmo = transpose(mat4{
 fly * 2.0f / aspect, 0.0f, 0.0f, 0.0f,
 0.0f, -fly * 2.0f, 0.0f, 0.0f,
 0.0f, 0.0f, (zfar + znear) / (zfar - znear), -(2.0f * zfar * znear) / (zfar - znear),
- 0.0f, 0.0f, 1.0f, 0.0f
- ));
+ 0.0f, 0.0f, 1.0f, 0.0f,
+ });
 if (!update_cam_from_path) {
 ImDrawList* list = ImGui::GetForegroundDrawList();
@@ -350,7 +350,7 @@ bool CameraPath::imgui_viz(ImDrawList* list, mat4 &view2proj, mat4 &world2proj,
 int i0 = cur_cam_i; while (i0 > 0 && keyframes[cur_cam_i].same_pos_as(keyframes[i0 - 1])) i0--;
 int i1 = cur_cam_i; while (i1 < keyframes.size() - 1 && keyframes[cur_cam_i].same_pos_as(keyframes[i1 + 1])) i1++;
 for (int i = i0; i <= i1; ++i) {
- keyframes[i].T = matrix[3].xyz;
+ keyframes[i].T = matrix[3].xyz();
 keyframes[i].R = quat(mat3(matrix));
 }
 changed=true;
 }
@@ -375,4 +375,4 @@ bool CameraPath::imgui_viz(ImDrawList* list, mat4
&view2proj, mat4 &world2proj, } #endif //NGP_GUI -NGP_NAMESPACE_END +} diff --git a/src/common_device.cu b/src/common_device.cu deleted file mode 100644 index c430ab617..000000000 --- a/src/common_device.cu +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - */ - -/** @file common_device.cu - * @author Thomas Müller, NVIDIA - */ - -#include -#include - -// #include - -#include - -using namespace tcnn; - -NGP_NAMESPACE_BEGIN - - -mat3 so3_log(const mat3& m) { - float tr = tcnn::clamp(m[0][0] + m[1][1] + m[2][2], -0.99999f, 1.0f); - float radians = acosf((tr - 1.0f) / 2.0f); - return radians / sqrt((1.0f + tr) * (3.0f - tr)) * (m - transpose(m)); -} - -mat3 so3_exp(const mat3& m) { - vec3 axis = {-m[2][1], m[2][0], -m[1][0]}; - float radians_sq = length2(axis); - if (radians_sq == 0.0f) { - return mat3(1.0f); - } - - float radians = sqrt(radians_sq); - return mat3(1.0f) + (sin(radians) / radians) * m + ((1.0f - cos(radians)) / radians_sq) * (m * m); -} - -mat4x3 se3_log(const mat4x3& m) { - mat3 omega = so3_log(mat3(m)); - vec3 axis = {-omega[2][1], omega[2][0], -omega[1][0]}; - float radians_sq = length2(axis); - mat3 inv_trans = mat3(1.0f); - if (radians_sq > 0.0f) { - float radians = sqrt(radians_sq); - inv_trans += -0.5f * omega + ((1.0f - 0.5f * radians * cos(0.5f * radians) / sin(0.5f * radians)) / radians_sq) * (omega * omega); - } - - return mat4x3(omega[0], omega[1], omega[2], inv_trans * m[3]); -} - -mat4x3 se3_exp(const mat4x3& m) { - mat3 omega{m}; - vec3 axis = {-omega[2][1], omega[2][0], -omega[1][0]}; - float radians_sq = length2(axis); - mat3 trans = mat3(1.0f); - if (radians_sq > 0.0f) { - float radians = sqrt(radians_sq); - trans += ((1.0f - cos(radians)) / radians_sq) * omega + ((radians - sin(radians)) / (radians * radians_sq)) * (omega * omega); - } - - mat3 rot = so3_exp(omega); - return mat4x3(rot[0], rot[1], rot[2], trans * m[3]); -} - -mat4 se3_log(const mat4& m) { - mat4 result = mat4(se3_log(mat4x3(m))); - result[3][3] = 0.0f; - return result; -} - -mat4 se3_exp(const mat4& m) { - return mat4(se3_exp(mat4x3(m))); -} - -float frobenius_norm(const mat4& m) { - return sqrt(length2(m[0]) + length2(m[1]) + length2(m[2]) + length2(m[3])); -} - -mat4 mat_sqrt(const mat4& m, float eps = 1e-10f) { - mat4 X = m, Y = mat4(1.0f); - for (uint32_t i = 0; i < 32; ++i) { - if (frobenius_norm(X * X - m) < eps) { - return X; - } - - mat4 iX = inverse(X); - X = 0.5f * (X + inverse(Y)); - Y = 0.5f * (Y + iX); - } - - return X; -} - -mat4 mat_log_taylor(const mat4& m, uint32_t n_iters) { - mat4 result = mat4(0.0f); - mat4 cur = m - mat4(1.0f); - float sign = 1.0f; - for (uint32_t i = 1; i < n_iters; ++i) { - result += (sign / (float)i) * cur; - cur *= (m - mat4(1.0f)); - sign = -sign; - } - return result; -} - -mat4 mat_log_hawkins(const mat4& m, float eps = 1e-10f) { - mat4 A = m - mat4(1.0f), Z = A, X = A; - for (uint32_t i = 2; i < 32; ++i) { - if (frobenius_norm(Z) < eps) { - return X; - } - - Z = Z * A; - X += (1.0f / (float)i) * Z; - } - - return X; -} - -mat4 mat_exp_power(const mat4& m, uint32_t n_iters) { - mat4 result = mat4(1.0f); - mat4 cur = m; - float div 
= 1.0f; - for (uint32_t i = 1; i < n_iters; ++i) { - div *= (float)i; - result += (1.0f / div) * cur; - cur *= m; - } - return result; -} - -mat4 mat_exp_pade(const mat4& m) { - // Pade approximation with scaling; same as Matlab. - // Pseudocode translated from Hawkins and Grimm [2007] - mat4 X = mat4(1.0f), D = mat4(1.0f), N = mat4(1.0f); - float c = 1.0f; - constexpr uint32_t q = 6; // Matlab's default when using this algorithm - - float s = -1.0f; - for (uint32_t k = 1; k <= q; ++k) { - c = c * (q - k + 1) / (k * (2 * q - k + 1)); - X = m * X; - auto cX = c * X; - N = N + cX; - D = D + s * cX; - s = -s; - } - - return inverse(D) * N; -} - -mat4 mat_log(const mat4& m) { - mat4 result(m); - - uint32_t j = 0; - for (; j < 32; ++j) { - if (frobenius_norm(result - mat4(1.0f)) < 1e-5f) { - break; - } - - result = mat_sqrt(result); - } - - result = mat_log_hawkins(result); - return scalbnf(1.0f, j) * result; -} - -mat4 mat_exp(const mat4& m) { - uint32_t N_SQUARING = max(0, 1 + (int)floor(log2(frobenius_norm(m)))); - - mat4 result = scalbnf(1.0f, -N_SQUARING) * m; - result = mat_exp_pade(result); - - for (uint32_t i = 0; i < N_SQUARING; ++i) { - result *= result; - } - - return result; -} - -mat3 orthogonalize(const mat3& m) { - return mat3{ - 0.5f * (3.0f - dot(m[0], m[0])) * m[0], - 0.5f * (3.0f - dot(m[1], m[1])) * m[1], - 0.5f * (3.0f - dot(m[2], m[2])) * m[2], - }; -} - -mat4x3 camera_log_lerp(const mat4x3& a, const mat4x3& b, float t) { - return mat_exp(mat_log(mat4(b) * inverse(mat4(a))) * t) * mat4(a); -} - -mat4x3 camera_slerp(const mat4x3& a, const mat4x3& b, float t) { - mat3 rot = slerp(a, b, t); - return {rot[0], rot[1], rot[2], mix(a[3], b[3], t)}; -} - -GPUMemory load_exr_gpu(const fs::path& path, int* width, int* height) { - float* out; // width * height * RGBA - load_exr(&out, width, height, path.str().c_str()); - ScopeGuard mem_guard{[&]() { free(out); }}; - - GPUMemory result((*width) * (*height) * 4); - result.copy_from_host(out); - return result; -} - -GPUMemory load_stbi_gpu(const fs::path& path, int* width, int* height) { - bool is_hdr = is_hdr_stbi(path); - - void* data; // width * height * RGBA - int comp; - if (is_hdr) { - data = load_stbi_float(path, width, height, &comp, 4); - } else { - data = load_stbi(path, width, height, &comp, 4); - } - - if (!data) { - throw std::runtime_error{std::string{stbi_failure_reason()}}; - } - - ScopeGuard mem_guard{[&]() { stbi_image_free(data); }}; - - if (*width == 0 || *height == 0) { - throw std::runtime_error{"Image has zero pixels."}; - } - - GPUMemory result((*width) * (*height) * 4); - if (is_hdr) { - result.copy_from_host((float*)data); - } else { - GPUMemory bytes((*width) * (*height) * 4); - bytes.copy_from_host((uint8_t*)data); - linear_kernel(from_rgba32, 0, nullptr, (*width) * (*height), bytes.data(), result.data(), false, false, 0); - } - - return result; -} - -NGP_NAMESPACE_END diff --git a/src/common.cu b/src/common_host.cu similarity index 76% rename from src/common.cu rename to src/common_host.cu index b6c2ec6ab..2cacd34e8 100644 --- a/src/common.cu +++ b/src/common_host.cu @@ -8,11 +8,15 @@ * license agreement from NVIDIA CORPORATION is strictly prohibited. 
*/ -/** @file common_device.cu +/** @file common_host.cu * @author Thomas Müller, NVIDIA */ -#include +#include +#include +#include +#include +#include #include @@ -21,7 +25,7 @@ #define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_WRITE_IMPLEMENTATION -#ifdef __NVCC__ +#ifdef __CUDACC__ # ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ # pragma nv_diag_suppress 550 # else @@ -30,7 +34,7 @@ #endif #include #include -#ifdef __NVCC__ +#ifdef __CUDACC__ # ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ # pragma nv_diag_default 550 # else @@ -39,7 +43,7 @@ #endif #ifdef _WIN32 -# include +# include #else # include # include @@ -50,9 +54,7 @@ #undef near #undef far -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { bool is_wsl() { #ifdef _WIN32 @@ -95,7 +97,7 @@ std::wstring native_string(const fs::path& path) { return path.wstr(); } std::string native_string(const fs::path& path) { return path.str(); } #endif -fs::path get_executable_dir() { +fs::path discover_executable_dir() { #ifdef _WIN32 WCHAR path[1024]; if (GetModuleFileNameW(NULL, path, 1024) == 0) { @@ -112,8 +114,8 @@ fs::path get_executable_dir() { #endif } -fs::path get_root_dir() { - auto executable_dir = get_executable_dir(); +fs::path discover_root_dir() { + auto executable_dir = discover_executable_dir(); fs::path exists_in_root_dir = "scripts"; for (const auto& candidate : { fs::path{"."}/exists_in_root_dir, @@ -248,4 +250,64 @@ FILE* native_fopen(const fs::path& path, const char* mode) { #endif } -NGP_NAMESPACE_END +GPUMemory load_exr_gpu(const fs::path& path, int* width, int* height) { + float* out; // width * height * RGBA + load_exr(&out, width, height, path.str().c_str()); + ScopeGuard mem_guard{[&]() { free(out); }}; + + GPUMemory result((*width) * (*height) * 4); + result.copy_from_host(out); + return result; +} + +GPUMemory load_stbi_gpu(const fs::path& path, int* width, int* height) { + bool is_hdr = is_hdr_stbi(path); + + void* data; // width * height * RGBA + int comp; + if (is_hdr) { + data = load_stbi_float(path, width, height, &comp, 4); + } else { + data = load_stbi(path, width, height, &comp, 4); + } + + if (!data) { + throw std::runtime_error{std::string{stbi_failure_reason()}}; + } + + ScopeGuard mem_guard{[&]() { stbi_image_free(data); }}; + + if (*width == 0 || *height == 0) { + throw std::runtime_error{"Image has zero pixels."}; + } + + GPUMemory result((*width) * (*height) * 4); + if (is_hdr) { + result.copy_from_host((float*)data); + } else { + GPUMemory bytes((*width) * (*height) * 4); + bytes.copy_from_host((uint8_t*)data); + linear_kernel(from_rgba32, 0, nullptr, (*width) * (*height), bytes.data(), result.data(), false, false, 0); + } + + return result; +} + +std::ostream& operator<<(std::ostream& os, const BoundingBox& bb) { + os << "["; + os << "min=[" << bb.min.x << "," << bb.min.y << "," << bb.min.z << "], "; + os << "max=[" << bb.max.x << "," << bb.max.y << "," << bb.max.z << "]"; + os << "]"; + return os; +} + +std::ostream& operator<<(std::ostream& os, const Triangle& triangle) { + os << "["; + os << "a=[" << triangle.a.x << "," << triangle.a.y << "," << triangle.a.z << "], "; + os << "b=[" << triangle.b.x << "," << triangle.b.y << "," << triangle.b.z << "], "; + os << "c=[" << triangle.c.x << "," << triangle.c.y << "," << triangle.c.z << "]"; + os << "]"; + return os; +} + +} diff --git a/src/dlss.cu b/src/dlss.cu index e498cce4f..d4c55428c 100644 --- a/src/dlss.cu +++ b/src/dlss.cu @@ -12,10 +12,10 @@ * @author Thomas Müller, NVIDIA */ -#include +#include #include -#include +#include #include @@ -36,7 +36,7 
@@ static_assert(false, "DLSS can only be compiled when both Vulkan and GUI support // NGX's macro `NVSDK_NGX_FAILED` results in a change of sign, which does not affect correctness. // Thus, suppress the corresponding warning. -#ifdef __NVCC__ +#ifdef __CUDACC__ # ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ # pragma nv_diag_suppress = integer_sign_change # else @@ -51,9 +51,7 @@ static_assert(false, "DLSS can only be compiled when both Vulkan and GUI support #include #include -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { extern std::atomic g_total_n_bytes_allocated; @@ -314,7 +312,7 @@ public: }; cudaDeviceProp cuda_device_prop; - CUDA_CHECK_THROW(cudaGetDeviceProperties(&cuda_device_prop, tcnn::cuda_device())); + CUDA_CHECK_THROW(cudaGetDeviceProperties(&cuda_device_prop, cuda_device())); auto is_same_as_cuda_device = [&](VkPhysicalDevice device) { VkPhysicalDeviceIDProperties physical_device_id_properties = {}; @@ -1222,4 +1220,4 @@ std::unique_ptr VulkanAndNgx::init_dlss(const ivec2& out_resolution) { return std::make_unique(shared_from_this(), out_resolution); } -NGP_NAMESPACE_END +} diff --git a/src/main.cu b/src/main.cu index ac79bd362..3494f94ff 100644 --- a/src/main.cu +++ b/src/main.cu @@ -23,9 +23,8 @@ using namespace args; using namespace ngp; using namespace std; -using namespace tcnn; -NGP_NAMESPACE_BEGIN +namespace ngp { int main_func(const std::vector& arguments) { ArgumentParser parser{ @@ -191,7 +190,7 @@ int main_func(const std::vector& arguments) { return 0; } -NGP_NAMESPACE_END +} #ifdef _WIN32 int wmain(int argc, wchar_t* argv[]) { diff --git a/src/marching_cubes.cu b/src/marching_cubes.cu index daff134fd..67c934089 100644 --- a/src/marching_cubes.cu +++ b/src/marching_cubes.cu @@ -14,7 +14,7 @@ #include #include -#include +#include #include // helpers to generate random values, directions #include @@ -35,13 +35,11 @@ #include -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { ivec3 get_marching_cubes_res(uint32_t res_1d, const BoundingBox &aabb) { - float scale = res_1d / compMax(aabb.max - aabb.min); - ivec3 res3d = (aabb.max - aabb.min) * scale + vec3(0.5f); + float scale = res_1d / max(aabb.max - aabb.min); + ivec3 res3d = (aabb.max - aabb.min) * scale + 0.5f; res3d.x = next_multiple((unsigned int)res3d.x, 16u); res3d.y = next_multiple((unsigned int)res3d.y, 16u); res3d.z = next_multiple((unsigned int)res3d.z, 16u); @@ -265,7 +263,7 @@ __global__ void gen_vertices(BoundingBox render_aabb, mat3 render_aabb_to_local, uint32_t y = blockIdx.y * blockDim.y + threadIdx.y; uint32_t z = blockIdx.z * blockDim.z + threadIdx.z; if (x>=res_3d.x || y>=res_3d.y || z>=res_3d.z) return; - vec3 scale = (render_aabb.max - render_aabb.min) / vec3(res_3d - ivec3(1)); + vec3 scale = (render_aabb.max - render_aabb.min) / vec3(res_3d - 1); vec3 offset=render_aabb.min; uint32_t res2=res_3d.x*res_3d.y; uint32_t res3=res_3d.x*res_3d.y*res_3d.z; @@ -352,7 +350,7 @@ __global__ void compute_centroids(uint32_t num_verts, vec3* centroids_out, const if (i>=num_verts) return; vec4 p = verts_in[i]; if (p.w<=0.f) return; - vec3 c = verts_in[i].xyz * (1.f / p.w); + vec3 c = verts_in[i].xyz() * (1.f / p.w); centroids_out[i]=c; } @@ -699,7 +697,7 @@ __global__ void gen_faces(ivec3 res_3d, const float* __restrict__ density, const } } -void compute_mesh_1ring(const tcnn::GPUMemory &verts, const tcnn::GPUMemory &indices, tcnn::GPUMemory &output_pos, tcnn::GPUMemory &output_normals) { // computes the average of the 1ring of all verts, as homogenous coordinates +void 
compute_mesh_1ring(const GPUMemory &verts, const GPUMemory &indices, GPUMemory &output_pos, GPUMemory &output_normals) { // computes the average of the 1-ring of all verts, as homogeneous coordinates
 output_pos.resize(verts.size());
 output_pos.memset(0);
 output_normals.resize(verts.size());
@@ -731,7 +729,7 @@ __global__ void compute_mesh_opt_gradients_kernel(
 p.w = 1.f;
 }
- vec3 target = p.xyz * (1.0f / p.w);
+ vec3 target = p.xyz() * (1.0f / p.w);
 vec3 smoothing_grad = src - target; // negative...
 vec3 input_gradient = *(const vec3 *)(input_gradients + i * input_gradient_width);
@@ -743,9 +741,9 @@ void compute_mesh_opt_gradients(
 float thresh,
- const tcnn::GPUMemory& verts,
- const tcnn::GPUMemory& normals,
- const tcnn::GPUMemory& verts_smoothed,
+ const GPUMemory& verts,
+ const GPUMemory& normals,
+ const GPUMemory& verts_smoothed,
 const network_precision_t* densities,
 uint32_t input_gradients_width,
 const float* input_gradients,
@@ -773,7 +771,7 @@ void compute_mesh_opt_gradients(
 );
 }
-void marching_cubes_gpu(cudaStream_t stream, BoundingBox render_aabb, mat3 render_aabb_to_local, ivec3 res_3d, float thresh, const tcnn::GPUMemory& density, tcnn::GPUMemory& verts_out, tcnn::GPUMemory& indices_out) {
+void marching_cubes_gpu(cudaStream_t stream, BoundingBox render_aabb, mat3 render_aabb_to_local, ivec3 res_3d, float thresh, const GPUMemory& density, GPUMemory& verts_out, GPUMemory& indices_out) {
 GPUMemory counters;
 counters.enlarge(4);
@@ -794,10 +792,11 @@ void marching_cubes_gpu(cudaStream_t stream, BoundingBox render_aabb, mat3 rende
 counters.copy_to_host(cpucounters);
 tlog::info() << "#vertices=" << cpucounters[0] << " #triangles=" << (cpucounters[1]/3);
- uint32_t n_verts=(cpucounters[0]+127)&~127; // round for later nn stuff
+ uint32_t n_verts = next_multiple(cpucounters[0], BATCH_SIZE_GRANULARITY); // round up to the batch size granularity of the neural-network queries below
 verts_out.resize(n_verts);
 verts_out.memset(0);
 indices_out.resize(cpucounters[1]);
+ // actually generate verts
 gen_vertices<<>>(render_aabb, render_aabb_to_local, res_3d, density.data(), vertex_grid, verts_out.data(), thresh, counters.data()+2);
 gen_faces<<>>(res_3d, density.data(), vertex_grid, indices_out.data(), thresh, counters.data()+2);
@@ -825,7 +824,7 @@ void save_mesh(
 // Replace invalid values with reasonable defaults
 for (size_t i = 0; i < cpuverts.size(); ++i) {
 if (!all(isfinite(cpuverts[i]))) cpuverts[i] = vec3(0.0f);
- if (!all(isfinite(cpunormals[i]))) cpunormals[i] = vec3(0.0f, 1.0f, 0.0f);
+ if (!all(isfinite(cpunormals[i]))) cpunormals[i] = vec3{0.0f, 1.0f, 0.0f};
 if (!all(isfinite(cpucolors[i]))) cpucolors[i] = vec3(0.0f);
 }
@@ -895,7 +894,7 @@ void save_mesh(
 vec3 p = (cpuverts[i]-nerf_offset)/nerf_scale;
 vec3 c = cpucolors[i];
 vec3 n = normalize(cpunormals[i]);
- unsigned char c8[3] = {(unsigned char)tcnn::clamp(c.x*255.f,0.f,255.f),(unsigned char)tcnn::clamp(c.y*255.f,0.f,255.f),(unsigned char)tcnn::clamp(c.z*255.f,0.f,255.f)};
+ unsigned char c8[3] = {(unsigned char)clamp(c.x*255.f,0.f,255.f),(unsigned char)clamp(c.y*255.f,0.f,255.f),(unsigned char)clamp(c.z*255.f,0.f,255.f)};
 fprintf(f, "%0.5f %0.5f %0.5f %0.3f %0.3f %0.3f %d %d %d\n", p.x, p.y, p.z, n.x, n.y, n.z, c8[0], c8[1], c8[2]);
 }
@@ -911,7 +910,7 @@ void save_mesh(
 for (size_t i = 0; i < cpuverts.size(); ++i) {
 vec3 p = (cpuverts[i]-nerf_offset)/nerf_scale;
 vec3 c = cpucolors[i];
- fprintf(f, "v %0.5f %0.5f %0.5f %0.3f %0.3f %0.3f\n", p.x, p.y, p.z, tcnn::clamp(c.x, 0.f, 1.f), tcnn::clamp(c.y, 0.f, 1.f),
tcnn::clamp(c.z, 0.f, 1.f)); + fprintf(f, "v %0.5f %0.5f %0.5f %0.3f %0.3f %0.3f\n", p.x, p.y, p.z, clamp(c.x, 0.f, 1.f), clamp(c.y, 0.f, 1.f), clamp(c.z, 0.f, 1.f)); } for (auto &v: cpunormals) { @@ -1012,9 +1011,9 @@ void save_density_grid_to_png(const GPUMemory& density, const fs::path& p int z = (u / res3d.x) + (v / res3d.y) * nacross; if (z < res3d.z) { if (swap_y_z) { - *dst++ = (uint8_t)tcnn::clamp((density_cpu[x + z*res3d.x + y*res3d.x*res3d.z]-thresh)*density_scale + 128.5f, 0.f, 255.f); + *dst++ = (uint8_t)clamp((density_cpu[x + z*res3d.x + y*res3d.x*res3d.z]-thresh)*density_scale + 128.5f, 0.f, 255.f); } else { - *dst++ = (uint8_t)tcnn::clamp((density_cpu[x + (res3d.y-1-y)*res3d.x + z*res3d.x*res3d.y]-thresh)*density_scale + 128.5f, 0.f, 255.f); + *dst++ = (uint8_t)clamp((density_cpu[x + (res3d.y-1-y)*res3d.x + z*res3d.x*res3d.y]-thresh)*density_scale + 128.5f, 0.f, 255.f); } } else { *dst++ = 0; @@ -1057,10 +1056,10 @@ void save_rgba_grid_to_png_sequence(const GPUMemory& rgba, const fs::path& for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { size_t i = swap_y_z ? (x + z*res3d.x + y*res3d.x*res3d.z) : (x + (res3d.y-1-y)*res3d.x + z*res3d.x*res3d.y); - *dst++ = (uint8_t)tcnn::clamp(rgba_cpu[i].x * 255.f, 0.f, 255.f); - *dst++ = (uint8_t)tcnn::clamp(rgba_cpu[i].y * 255.f, 0.f, 255.f); - *dst++ = (uint8_t)tcnn::clamp(rgba_cpu[i].z * 255.f, 0.f, 255.f); - *dst++ = (uint8_t)tcnn::clamp(rgba_cpu[i].w * 255.f, 0.f, 255.f); + *dst++ = (uint8_t)clamp(rgba_cpu[i].x * 255.f, 0.f, 255.f); + *dst++ = (uint8_t)clamp(rgba_cpu[i].y * 255.f, 0.f, 255.f); + *dst++ = (uint8_t)clamp(rgba_cpu[i].z * 255.f, 0.f, 255.f); + *dst++ = (uint8_t)clamp(rgba_cpu[i].w * 255.f, 0.f, 255.f); } } @@ -1109,4 +1108,4 @@ void save_rgba_grid_to_raw_file(const GPUMemory& rgba, const fs::path& pat tlog::success() << "Wrote RGBA raw file to " << actual_path.str(); } -NGP_NAMESPACE_END +} diff --git a/src/nerf_loader.cu b/src/nerf_loader.cu index a61da2876..47fe4f738 100644 --- a/src/nerf_loader.cu +++ b/src/nerf_loader.cu @@ -31,14 +31,12 @@ #include #include #include -#include #include #include -using namespace tcnn; using namespace std::literals; -NGP_NAMESPACE_BEGIN +namespace ngp { __global__ void convert_rgba32(const uint64_t num_pixels, const uint8_t* __restrict__ pixels, uint8_t* __restrict__ out, bool white_2_transparent = false, bool black_2_transparent = false, uint32_t mask_color = 0) { const uint64_t i = threadIdx.x + blockIdx.x * blockDim.x; @@ -168,8 +166,8 @@ NerfDataset create_empty_nerf_dataset(size_t n_images, int aabb_scale, bool is_h result.is_hdr = is_hdr; result.paths = std::vector(n_images, ""); for (size_t i = 0; i < n_images; ++i) { - result.xforms[i].start = mat4x3(1.0f); - result.xforms[i].end = mat4x3(1.0f); + result.xforms[i].start = mat4x3::identity(); + result.xforms[i].end = mat4x3::identity(); } return result; } @@ -353,6 +351,9 @@ NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amou for (auto&& frame : frames) { // Compatibility with Windows paths on Linux. (Breaks linux filenames with "\\" in them, which is acceptable for us.) 
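 // e.g. a hypothetical "images\\0001.png" becomes "images/0001.png"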
frame["file_path"] = replace_all(frame["file_path"], "\\", "/"); + if (frame.contains("depth_path")) { + frame["depth_path"] = replace_all(frame["depth_path"], "\\", "/"); + } } if (json.contains("n_frames")) { @@ -527,7 +528,7 @@ NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amou result.up[2] = float(json["up"][0]); } - if (json.contains("envmap") && !any(equal(result.envmap_resolution, ivec2(0)))) { + if (json.contains("envmap") && product(result.envmap_resolution) > 0) { fs::path envmap_path = resolve_path(base_path, json["envmap"]); if (!envmap_path.exists()) { throw std::runtime_error{fmt::format("Environment map {} does not exist.", envmap_path.str())}; @@ -587,7 +588,7 @@ NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amou } tlog::success() << "Alpha loaded from " << alphapath; - for (int i = 0; i < compMul(dst.res); ++i) { + for (int i = 0; i < product(dst.res); ++i) { img[i*4+3] = (uint8_t)(255.0f*srgb_to_linear(alpha_img[i*4]*(1.f/255.f))); // copy red channel of alpha to alpha.png to our alpha channel } } @@ -606,7 +607,7 @@ NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amou } dst.mask_color = 0x00FF00FF; // HOT PINK - for (int i = 0; i < compMul(dst.res); ++i) { + for (int i = 0; i < product(dst.res); ++i) { if (mask_img[i*4] != 0 || mask_img[i*4+1] != 0 || mask_img[i*4+2] != 0) { *(uint32_t*)&img[i*4] = dst.mask_color; } @@ -638,7 +639,7 @@ NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amou fs::path rayspath = path.parent_path() / fmt::format("rays_{}.dat", path.basename()); if (enable_ray_loading && rayspath.exists()) { - uint32_t n_pixels = compMul(dst.res); + uint32_t n_pixels = product(dst.res); dst.rays = (Ray*)malloc(n_pixels * sizeof(Ray)); std::ifstream rays_file{native_string(rayspath), std::ios::binary}; @@ -664,11 +665,11 @@ NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amou nlohmann::json& jsonmatrix_end = frame.contains("transform_matrix_end") ? 
frame["transform_matrix_end"] : jsonmatrix_start; if (frame.contains("driver_parameters")) { - vec3 light_dir( + vec3 light_dir{ frame["driver_parameters"].value("LightX", 0.f), frame["driver_parameters"].value("LightY", 0.f), frame["driver_parameters"].value("LightZ", 0.f) - ); + }; result.metadata[i_img].light_dir = result.nerf_direction_to_ngp(normalize(light_dir)); result.has_light_dirs = true; result.n_extra_learnable_dims = 0; @@ -746,7 +747,7 @@ void NerfDataset::set_training_image(int frame_idx, const ivec2& image_resolutio throw std::runtime_error{"NerfDataset::set_training_image: invalid frame index"}; } - size_t n_pixels = compMul(image_resolution); + size_t n_pixels = product(image_resolution); size_t img_size = n_pixels * 4; // 4 channels size_t image_type_stride = image_type_size(image_type); // copy to gpu if we need to do a conversion @@ -800,7 +801,7 @@ void NerfDataset::set_training_image(int frame_idx, const ivec2& image_resolutio // apply requested sharpening if (sharpen_amount > 0.f) { if (image_type == EImageDataType::Byte) { - tcnn::GPUMemory images_data_half(img_size * sizeof(__half)); + GPUMemory images_data_half(img_size * sizeof(__half)); linear_kernel(from_rgba32<__half>, 0, nullptr, n_pixels, (uint8_t*)pixels, (__half*)images_data_half.data(), white_transparent, black_transparent, mask_color); pixelmemory[frame_idx] = std::move(images_data_half); dst = pixelmemory[frame_idx].data(); @@ -809,7 +810,7 @@ void NerfDataset::set_training_image(int frame_idx, const ivec2& image_resolutio assert(image_type == EImageDataType::Half || image_type == EImageDataType::Float); - tcnn::GPUMemory images_data_sharpened(img_size * image_type_size(image_type)); + GPUMemory images_data_sharpened(img_size * image_type_size(image_type)); float center_w = 4.f + 1.f / sharpen_amount; // center_w ranges from 5 (strong sharpening) to infinite (no sharpening) if (image_type == EImageDataType::Half) { @@ -862,4 +863,4 @@ void NerfDataset::update_metadata(int first, int last) { CUDA_CHECK_THROW(cudaMemcpy(metadata_gpu.data() + first, metadata.data() + first, n * sizeof(TrainingImageMetadata), cudaMemcpyHostToDevice)); } -NGP_NAMESPACE_END +} diff --git a/src/openxr_hmd.cu b/src/openxr_hmd.cu index 08353b772..a348e317d 100644 --- a/src/openxr_hmd.cu +++ b/src/openxr_hmd.cu @@ -15,8 +15,6 @@ * view, hand, and eye poses, as well as controller inputs. 
*/ -#define NOMINMAX - #include #include #include @@ -38,9 +36,7 @@ #pragma GCC diagnostic ignored "-Wmissing-field-initializers" //TODO: XR struct are uninitiaized apart from their type #endif -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { // function XrEnumStr turns enum into string for printing // uses expansion macro and data provided in openxr_reflection.h @@ -1254,7 +1250,7 @@ void OpenXRHMD::end_frame(FrameInfoPtr frame_info, float znear, float zfar, bool XR_CHECK_THROW(xrEndFrame(m_session, &frame_end_info)); } -NGP_NAMESPACE_END +} #ifdef __GNUC__ #pragma GCC diagnostic pop diff --git a/src/optix/pathescape.cu b/src/optix/pathescape.cu index 8e5b8a2c5..13b711d66 100644 --- a/src/optix/pathescape.cu +++ b/src/optix/pathescape.cu @@ -15,13 +15,12 @@ #include #include + #include #include "pathescape.h" -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { extern "C" { __constant__ PathEscape::Params params; @@ -121,4 +120,4 @@ extern "C" __global__ void __closesthit__ch() { optixSetPayload_0(optixGetPrimitiveIndex()); } -NGP_NAMESPACE_END +} diff --git a/src/optix/pathescape.h b/src/optix/pathescape.h index d75a08352..cbdf9e09b 100644 --- a/src/optix/pathescape.h +++ b/src/optix/pathescape.h @@ -20,7 +20,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { struct PathEscape { struct Params { @@ -35,4 +35,4 @@ struct PathEscape { struct HitGroupData {}; }; -NGP_NAMESPACE_END +} diff --git a/src/optix/program.h b/src/optix/program.h index 240c9497b..42302882a 100644 --- a/src/optix/program.h +++ b/src/optix/program.h @@ -14,7 +14,7 @@ #pragma once -NGP_NAMESPACE_BEGIN +namespace ngp { #define OPTIX_CHECK_THROW(x) \ do { \ @@ -34,7 +34,6 @@ NGP_NAMESPACE_BEGIN } \ } while(0) - namespace optix { template struct SbtRecord { @@ -236,4 +235,4 @@ namespace optix { }; } -NGP_NAMESPACE_END +} diff --git a/src/optix/raystab.cu b/src/optix/raystab.cu index 593e02c14..412718a52 100644 --- a/src/optix/raystab.cu +++ b/src/optix/raystab.cu @@ -20,9 +20,7 @@ #include "raystab.h" -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { extern "C" { __constant__ Raystab::Params params; @@ -78,4 +76,4 @@ extern "C" __global__ void __closesthit__ch() { optixSetPayload_0(1); } -NGP_NAMESPACE_END +} diff --git a/src/optix/raystab.h b/src/optix/raystab.h index db8fe99b9..0f11e930b 100644 --- a/src/optix/raystab.h +++ b/src/optix/raystab.h @@ -19,7 +19,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { struct Raystab { struct Params { @@ -33,4 +33,4 @@ struct Raystab { struct HitGroupData {}; }; -NGP_NAMESPACE_END +} diff --git a/src/optix/raytrace.cu b/src/optix/raytrace.cu index ea438c302..562748236 100644 --- a/src/optix/raytrace.cu +++ b/src/optix/raytrace.cu @@ -14,11 +14,12 @@ */ #include + #include #include "raytrace.h" -NGP_NAMESPACE_BEGIN +namespace ngp { extern "C" { __constant__ Raytrace::Params params; @@ -70,4 +71,4 @@ extern "C" __global__ void __closesthit__ch() { optixSetPayload_1(__float_as_int(optixGetRayTmax())); } -NGP_NAMESPACE_END +} diff --git a/src/optix/raytrace.h b/src/optix/raytrace.h index e7b406349..eaf43bd3f 100644 --- a/src/optix/raytrace.h +++ b/src/optix/raytrace.h @@ -20,7 +20,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { struct Raytrace { struct Params { @@ -35,4 +35,4 @@ struct Raytrace { struct HitGroupData {}; }; -NGP_NAMESPACE_END +} diff --git a/src/python_api.cu b/src/python_api.cu index 0df6c001b..851d7c9c4 100644 --- a/src/python_api.cu +++ b/src/python_api.cu @@ -22,7 +22,7 @@ #include #include #include -#include +#include 
#include #include @@ -37,14 +37,10 @@ # include #endif -using namespace tcnn; using namespace nlohmann; namespace py = pybind11; -using namespace pybind11::literals; // to bring in the `_a` literal - -NGP_NAMESPACE_BEGIN - +namespace ngp { void Testbed::Nerf::Training::set_image(int frame_idx, pybind11::array_t img, pybind11::array_t depth_img, float depth_scale) { if (frame_idx < 0 || frame_idx >= dataset.n_images) { @@ -63,7 +59,7 @@ void Testbed::Nerf::Training::set_image(int frame_idx, pybind11::array_t py::buffer_info depth_buf = depth_img.request(); - dataset.set_training_image(frame_idx, {img_buf.shape[1], img_buf.shape[0]}, (const void*)img_buf.ptr, (const float*)depth_buf.ptr, depth_scale, false, EImageDataType::Float, EDepthDataType::Float); + dataset.set_training_image(frame_idx, {(int)img_buf.shape[1], (int)img_buf.shape[0]}, (const void*)img_buf.ptr, (const float*)depth_buf.ptr, depth_scale, false, EImageDataType::Float, EDepthDataType::Float); } void Testbed::override_sdf_training_data(py::array_t points, py::array_t distances) { @@ -99,7 +95,7 @@ void Testbed::override_sdf_training_data(py::array_t points, py::array_t< } pybind11::dict Testbed::compute_marching_cubes_mesh(ivec3 res3d, BoundingBox aabb, float thresh) { - mat3 render_aabb_to_local = mat3(1.0f); + mat3 render_aabb_to_local = mat3::identity(); if (aabb.is_empty()) { aabb = m_testbed_mode == ETestbedMode::Nerf ? m_render_aabb : m_aabb; render_aabb_to_local = m_render_aabb_to_local; @@ -121,7 +117,7 @@ pybind11::dict Testbed::compute_marching_cubes_mesh(ivec3 res3d, BoundingBox aab ns[i] = normalize(ns[i]); } - return py::dict("V"_a=cpuverts, "N"_a=cpunormals, "C"_a=cpucolors, "F"_a=cpuindices); + return py::dict(py::arg("V")=cpuverts, py::arg("N")=cpunormals, py::arg("C")=cpucolors, py::arg("F")=cpuindices); } py::array_t Testbed::render_to_cpu(int width, int height, int spp, bool linear, float start_time, float end_time, float fps, float shutter_fraction) { @@ -237,7 +233,7 @@ py::array_t Testbed::view(bool linear, size_t view_idx) const { #ifdef NGP_GUI py::array_t Testbed::screenshot(bool linear, bool front_buffer) const { - std::vector tmp(compMul(m_window_res) * 4); + std::vector tmp(product(m_window_res) * 4); glReadBuffer(front_buffer ? 
GL_FRONT : GL_BACK); glReadPixels(0, 0, m_window_res.x, m_window_res.y, GL_RGBA, GL_FLOAT, tmp.data()); @@ -266,7 +262,7 @@ py::array_t Testbed::screenshot(bool linear, bool front_buffer) const { PYBIND11_MODULE(pyngp, m) { m.doc() = "Instant neural graphics primitives"; - m.def("free_temporary_memory", &tcnn::free_all_gpu_memory_arenas); + m.def("free_temporary_memory", &free_all_gpu_memory_arenas); py::enum_(m, "TestbedMode") .value("Nerf", ETestbedMode::Nerf) @@ -514,7 +510,6 @@ PYBIND11_MODULE(pyngp, m) { .def_readwrite("screen_center", &Testbed::m_screen_center) .def_readwrite("training_batch_size", &Testbed::m_training_batch_size) .def("set_nerf_camera_matrix", &Testbed::set_nerf_camera_matrix) - .def("add_training_views_to_camera_path", &Testbed::add_training_views_to_camera_path) .def("set_camera_to_training_view", &Testbed::set_camera_to_training_view) .def("first_training_view", &Testbed::first_training_view) .def("last_training_view", &Testbed::last_training_view) @@ -566,7 +561,7 @@ PYBIND11_MODULE(pyngp, m) { .def("crop_box_corners", &Testbed::crop_box_corners, py::arg("nerf_space") = true) .def_property("root_dir", [](py::object& obj) { return obj.cast().root_dir().str(); }, - [](const py::object& obj, const std::string& value) { obj.cast().m_root_dir = value; } + [](const py::object& obj, const std::string& value) { obj.cast().set_root_dir(value); } ) ; @@ -579,7 +574,6 @@ PYBIND11_MODULE(pyngp, m) { }) ; - py::class_ nerf(testbed, "Nerf"); nerf .def_readonly("training", &Testbed::Nerf::training) @@ -597,6 +591,12 @@ PYBIND11_MODULE(pyngp, m) { .def_readwrite("visualize_cameras", &Testbed::Nerf::visualize_cameras) .def_readwrite("glow_y_cutoff", &Testbed::Nerf::glow_y_cutoff) .def_readwrite("glow_mode", &Testbed::Nerf::glow_mode) + .def_readwrite("render_gbuffer_hard_edges", &Testbed::Nerf::render_gbuffer_hard_edges) + .def_readwrite("rendering_extra_dims_from_training_view", &Testbed::Nerf::rendering_extra_dims_from_training_view, "If non-negative, indicates the training view from which the extra dims are used. 
If -1, uses the values previously set by `set_rendering_extra_dims`.") + .def("find_closest_training_view", &Testbed::Nerf::find_closest_training_view, "Obtain the training view that is closest to the current camera.") + .def("set_rendering_extra_dims_from_training_view", &Testbed::Nerf::set_rendering_extra_dims_from_training_view, "Set the extra dims that are used for rendering to those that were trained for a given training view.") + .def("set_rendering_extra_dims", &Testbed::Nerf::set_rendering_extra_dims, "Set the extra dims that are used for rendering.") + .def("get_rendering_extra_dims", &Testbed::Nerf::get_rendering_extra_dims_cpu, "Get the extra dims that are currently used for rendering.") ; py::class_ brdfparams(m, "BRDFParams"); @@ -649,6 +649,7 @@ PYBIND11_MODULE(pyngp, m) { .def_readwrite("depth_loss_type", &Testbed::Nerf::Training::depth_loss_type) .def_readwrite("snap_to_pixel_centers", &Testbed::Nerf::Training::snap_to_pixel_centers) .def_readwrite("optimize_extrinsics", &Testbed::Nerf::Training::optimize_extrinsics) + .def_readwrite("optimize_per_image_latents", &Testbed::Nerf::Training::optimize_extra_dims) .def_readwrite("optimize_extra_dims", &Testbed::Nerf::Training::optimize_extra_dims) .def_readwrite("optimize_exposure", &Testbed::Nerf::Training::optimize_exposure) .def_readwrite("optimize_distortion", &Testbed::Nerf::Training::optimize_distortion) @@ -667,6 +668,7 @@ PYBIND11_MODULE(pyngp, m) { .def_readwrite("exposure_l2_reg", &Testbed::Nerf::Training::exposure_l2_reg) .def_readwrite("depth_supervision_lambda", &Testbed::Nerf::Training::depth_supervision_lambda) .def_readonly("dataset", &Testbed::Nerf::Training::dataset) + .def("get_extra_dims", &Testbed::Nerf::Training::get_extra_dims_cpu, "Get the extra dims (including trained latent code) for a specified training view.") .def("set_camera_intrinsics", &Testbed::Nerf::Training::set_camera_intrinsics, py::arg("frame_idx"), py::arg("fx")=0.f, py::arg("fy")=0.f, @@ -728,4 +730,4 @@ PYBIND11_MODULE(pyngp, m) { ; } -NGP_NAMESPACE_END +} diff --git a/src/render_buffer.cu b/src/render_buffer.cu index 433bb6d52..0ba1fad1e 100644 --- a/src/render_buffer.cu +++ b/src/render_buffer.cu @@ -33,9 +33,7 @@ #include -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { extern std::atomic g_total_n_bytes_allocated; @@ -46,7 +44,7 @@ void CudaSurface2D::free() { m_surface = 0; if (m_array) { cudaFreeArray(m_array); - g_total_n_bytes_allocated -= compMul(m_size) * sizeof(float) * m_n_channels; + g_total_n_bytes_allocated -= product(m_size) * sizeof(float) * m_n_channels; } m_array = nullptr; m_size = ivec2(0); @@ -70,7 +68,7 @@ void CudaSurface2D::resize(const ivec2& size, int n_channels) { } CUDA_CHECK_THROW(cudaMallocArray(&m_array, &desc, size.x, size.y, cudaArraySurfaceLoadStore)); - g_total_n_bytes_allocated += compMul(m_size) * sizeof(float) * n_channels; + g_total_n_bytes_allocated += product(m_size) * sizeof(float) * n_channels; struct cudaResourceDesc resource_desc; memset(&resource_desc, 0, sizeof(resource_desc)); @@ -198,7 +196,7 @@ GLTexture::CUDAMapping::CUDAMapping(GLuint texture_id, const ivec2& size, int n_ // falling back to a regular cuda surface + CPU copy of data m_cuda_surface = std::make_unique(); m_cuda_surface->resize(size, n_channels); - m_data_cpu.resize(compMul(m_size) * n_channels); + m_data_cpu.resize(product(m_size) * n_channels); return; } @@ -253,10 +251,10 @@ __global__ void accumulate_kernel(ivec2 resolution, vec4* frame_buffer, vec4* ac break; } case EColorSpace::SRGB: - color.rgb = 
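The extra-dims bindings added above expose per-training-view latent codes that get appended to the network input. As a hedged sketch of the layout they imply (one code of `n_extra_dims` floats per training image; the helper name and the flat-array layout are assumptions for illustration, not the actual implementation):

	// Hypothetical accessor: latent code of one training view in a flat array.
	const float* extra_dims_for_view(const float* latents, uint32_t view, uint32_t n_extra_dims) {
		return latents + (size_t)view * n_extra_dims; // codes assumed stored contiguously per view
	}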
linear_to_srgb(color.rgb); + color.rgb() = linear_to_srgb(color.rgb()); // fallthrough is intended! case EColorSpace::Linear: - tmp.rgb = (tmp.rgb * sample_count + color.rgb) / (sample_count+1); break; + tmp.rgb() = (tmp.rgb() * sample_count + color.rgb()) / (sample_count+1); break; } tmp.a = (tmp.a * sample_count + color.a) / (sample_count+1); @@ -307,7 +305,7 @@ __device__ vec3 tonemap(vec3 x, ETonemapCurve curve) { k3 = 4.0f * k3; k4 = 2.0f * k4; } else { //if (curve == ETonemapCurve::Reinhard) - const vec3 luminance_coefficients = vec3(0.2126f, 0.7152f, 0.0722f); + const vec3 luminance_coefficients = {0.2126f, 0.7152f, 0.0722f}; float Y = dot(luminance_coefficients, x); return x * (1.f / (Y + 1.0f)); @@ -392,20 +390,20 @@ __global__ void overlay_image_kernel( // The background color is represented in SRGB, so convert // to linear if that's not the space in which we're rendering. if (color_space != EColorSpace::SRGB) { - background_color.xyz = srgb_to_linear(background_color.xyz); + background_color.xyz() = srgb_to_linear(background_color.xyz()); } else { if (color.a > 0) { - color.rgb = linear_to_srgb(color.rgb() / color.a) * color.a; + color.rgb() = linear_to_srgb(color.rgb() / color.a) * color.a; } else { - color.rgb = vec3(0.0f); + color.rgb() = vec3(0.0f); } } float weight = (1 - color.a) * background_color.a; - color.rgb += background_color.rgb * weight; + color.rgb() += background_color.rgb() * weight; color.a += weight; - color.rgb = tonemap(color.rgb, exposure, tonemap_curve, color_space, output_color_space); + color.rgb() = tonemap(color.rgb(), exposure, tonemap_curve, color_space, output_color_space); vec4 prev_color; surf2Dread((float4*)&prev_color, surface, x * sizeof(float4), y); @@ -414,20 +412,20 @@ __global__ void overlay_image_kernel( } __device__ vec3 colormap_turbo(float x) { - const vec4 kRedVec4 = vec4(0.13572138f, 4.61539260f, -42.66032258f, 132.13108234f); - const vec4 kGreenVec4 = vec4(0.09140261f, 2.19418839f, 4.84296658f, -14.18503333f); - const vec4 kBlueVec4 = vec4(0.10667330f, 12.64194608f, -60.58204836f, 110.36276771f); - const vec2 kRedVec2 = vec2(-152.94239396f, 59.28637943f); - const vec2 kGreenVec2 = vec2(4.27729857f, 2.82956604f); - const vec2 kBlueVec2 = vec2(-89.90310912f, 27.34824973f); + const vec4 kRedVec4 = {0.13572138f, 4.61539260f, -42.66032258f, 132.13108234f}; + const vec4 kGreenVec4 = {0.09140261f, 2.19418839f, 4.84296658f, -14.18503333f}; + const vec4 kBlueVec4 = {0.10667330f, 12.64194608f, -60.58204836f, 110.36276771f}; + const vec2 kRedVec2 = {-152.94239396f, 59.28637943f}; + const vec2 kGreenVec2 = {4.27729857f, 2.82956604f}; + const vec2 kBlueVec2 = {-89.90310912f, 27.34824973f}; x = __saturatef(x); - vec4 v4 = vec4{ 1.0f, x, x * x, x * x * x }; - vec2 v2 = vec2{ v4.w * x, v4.w * v4.z }; - return vec3{ + vec4 v4 = { 1.0f, x, x * x, x * x * x }; + vec2 v2 = { v4.w * x, v4.w * v4.z }; + return { dot(v4, kRedVec4) + dot(v2, kRedVec2), dot(v4, kGreenVec4) + dot(v2, kGreenVec2), - dot(v4, kBlueVec4) + dot(v2, kBlueVec2) + dot(v4, kBlueVec4) + dot(v2, kBlueVec2), }; } @@ -504,8 +502,8 @@ __global__ void overlay_false_color_kernel(ivec2 resolution, ivec2 training_reso float scale = training_resolution[fov_axis] / float(resolution[fov_axis]); float u = (x+0.5f-resolution.x*0.5f) * scale + training_resolution.x*0.5f; float v = (y+0.5f-resolution.y*0.5f) * scale + training_resolution.y*0.5f; - int srcx = floorf(u * error_map_resolution.x / float(max(1.f, (float)training_resolution.x))); - int srcy = floorf(v * error_map_resolution.y / 
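The recurring `color.rgb = ...` to `color.rgb() = ...` edits in these kernels stem from the vector-math swap: GLM exposes swizzles as data members, whereas tcnn-style vectors expose them as member functions returning a writable view. A toy sketch of the mechanism (illustrative only; a real implementation avoids the aliasing shortcut taken here):

	struct vec4_sketch {
		float x, y, z, w;
		// View of the first three components; writable because a reference is returned.
		vec3& rgb() { return *reinterpret_cast<vec3*>(this); }
	};
	// Usage then mirrors the kernels above:
	// color.rgb() = linear_to_srgb(color.rgb());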
float(max(1.f, (float)training_resolution.y))); + int srcx = floor(u * error_map_resolution.x / float(max(1.f, (float)training_resolution.x))); + int srcy = floor(v * error_map_resolution.y / float(max(1.f, (float)training_resolution.y))); uint32_t srcidx = srcx + error_map_resolution.x * srcy; @@ -541,18 +539,18 @@ __global__ void tonemap_kernel(ivec2 resolution, float exposure, vec4 background // The background color is represented in SRGB, so convert // to linear if that's not the space in which we're rendering. if (color_space != EColorSpace::SRGB) { - background_color.rgb = srgb_to_linear(background_color.rgb); + background_color.rgb() = srgb_to_linear(background_color.rgb()); } vec4 color = accumulate_buffer[idx]; float weight = (1 - color.a) * background_color.a; - color.rgb += background_color.rgb * weight; + color.rgb() += background_color.rgb() * weight; color.a += weight; - color.rgb = tonemap(color.rgb, vec3(exposure), tonemap_curve, color_space, output_color_space); + color.rgb() = tonemap(color.rgb(), vec3(exposure), tonemap_curve, color_space, output_color_space); if (unmultiply_alpha && color.a > 0.0f) { - color.rgb = color.rgb() / color.a; + color.rgb() = color.rgb() / color.a; } if (clamp_output_color) { @@ -603,7 +601,7 @@ __global__ void depth_splat_kernel( } void CudaRenderBufferView::clear(cudaStream_t stream) const { - size_t n_pixels = compMul(resolution); + size_t n_pixels = product(resolution); CUDA_CHECK_THROW(cudaMemsetAsync(frame_buffer, 0, n_pixels * sizeof(vec4), stream)); CUDA_CHECK_THROW(cudaMemsetAsync(depth_buffer, 0, n_pixels * sizeof(float), stream)); } @@ -791,4 +789,4 @@ void CudaRenderBuffer::disable_dlss() { m_dlss = nullptr; } -NGP_NAMESPACE_END +} diff --git a/src/testbed.cu b/src/testbed.cu index ce10a0385..78b000cc6 100644 --- a/src/testbed.cu +++ b/src/testbed.cu @@ -12,8 +12,8 @@ * @author Thomas Müller & Alex Evans, NVIDIA */ -#include #include +#include #include #include #include @@ -28,8 +28,8 @@ #include #include -#include #include +#include #include #include @@ -45,18 +45,18 @@ #include #ifdef NGP_GUI -# include -# include -# include -# include -# ifdef _WIN32 -# include -# else -# include -# endif -# include -# include -# include +# include +# include +# include +# include +# ifdef _WIN32 +# include +# else +# include +# endif +# include +# include +# include #endif @@ -68,9 +68,8 @@ using namespace std::literals::chrono_literals; -using namespace tcnn; -NGP_NAMESPACE_BEGIN +namespace ngp { int do_system(const std::string& cmd) { #ifdef _WIN32 @@ -358,7 +357,7 @@ void Testbed::load_file(const fs::path& path) { // want to immediately start training on that data. So: go for it. 
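`compMul` becoming `product` (and, further down, `compMax` becoming `max` and `compAdd` becoming `sum`) is a straight rename from GLM's component reductions to tcnn's. What such a reduction looks like, assuming a generic N-component `tvec` as used elsewhere in this patch (signature illustrative):

	template <typename T, uint32_t N>
	NGP_HOST_DEVICE T product(const tvec<T, N>& v) {
		T result = v[0];
		for (uint32_t i = 1; i < N; ++i) {
			result *= v[i]; // fold all components, e.g. pixel count of an ivec2 resolution
		}
		return result;
	}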
m_train = true; } - } catch (std::runtime_error& e) { + } catch (const std::runtime_error& e) { tlog::error() << "Failed to load training data: " << e.what(); } } @@ -443,19 +442,6 @@ void Testbed::next_training_view() { set_camera_to_training_view(m_nerf.training.view); } -void Testbed::add_training_views_to_camera_path() { - for (int i = 0; i < m_nerf.training.dataset.n_images; ++i) { - int n = std::max(0, int(m_camera_path.keyframes.size()) - 1); - auto camera = get_xform_given_rolling_shutter(m_nerf.training.transforms[i], m_nerf.training.dataset.metadata[i].rolling_shutter, vec2{0.5f, 0.5f}, 0.0f); - int j = (int) ceil(m_camera_path.play_time * (float) n + 0.001f); - if (j > m_camera_path.keyframes.size()) j = m_camera_path.keyframes.size(); - if (j < 0) j = 0; - m_camera_path.keyframes.insert(m_camera_path.keyframes.begin() + j, CameraKeyframe(camera, m_slice_plane_z, m_scale, fov(), m_aperture_size, m_nerf.glow_mode, m_nerf.glow_y_cutoff)); - n = std::max(0, int(m_camera_path.keyframes.size()) - 1); - m_camera_path.play_time = n ? float(j) / float(n) : 1.f; - } -} - void Testbed::set_camera_to_training_view(int trainview) { auto old_look_at = look_at(); m_camera = m_smoothed_camera = get_xform_given_rolling_shutter(m_nerf.training.transforms[trainview], m_nerf.training.dataset.metadata[trainview].rolling_shutter, vec2{0.5f, 0.5f}, 0.0f); @@ -487,11 +473,11 @@ void Testbed::reset_camera() { m_scale = 1.5f; } - m_camera = transpose(mat3x4( + m_camera = transpose(mat3x4{ 1.0f, 0.0f, 0.0f, 0.5f, 0.0f, -1.0f, 0.0f, 0.5f, 0.0f, 0.0f, -1.0f, 0.5f - )); + }); m_camera[3] -= m_scale * view_dir(); @@ -509,7 +495,7 @@ void Testbed::set_train(bool mtrain) { } void Testbed::compute_and_save_marching_cubes_mesh(const fs::path& filename, ivec3 res3d , BoundingBox aabb, float thresh, bool unwrap_it) { - mat3 render_aabb_to_local = mat3(1.0f); + mat3 render_aabb_to_local = mat3::identity(); if (aabb.is_empty()) { aabb = m_testbed_mode == ETestbedMode::Nerf ? m_render_aabb : m_aabb; render_aabb_to_local = m_render_aabb_to_local; @@ -519,7 +505,7 @@ void Testbed::compute_and_save_marching_cubes_mesh(const fs::path& filename, ive } ivec3 Testbed::compute_and_save_png_slices(const fs::path& filename, int res, BoundingBox aabb, float thresh, float density_range, bool flip_y_and_z_axes) { - mat3 render_aabb_to_local = mat3(1.0f); + mat3 render_aabb_to_local = mat3::identity(); if (aabb.is_empty()) { aabb = m_testbed_mode == ETestbedMode::Nerf ? 
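Every `mat3(1.0f)` here turns into `mat3::identity()` because the scalar constructor's meaning changes with the vector-math swap: GLM fills the diagonal, while a tcnn-style scalar constructor is assumed here to broadcast to all elements, so the identity matrix needs a named factory. In short:

	mat3 a = mat3::identity(); // diag(1, 1, 1); what GLM's mat3(1.0f) used to mean
	mat3 b = mat3(1.0f);       // assumed broadcast: every element set to 1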
m_render_aabb : m_aabb; render_aabb_to_local = m_render_aabb_to_local; @@ -547,12 +533,16 @@ ivec3 Testbed::compute_and_save_png_slices(const fs::path& filename, int res, Bo fs::path Testbed::root_dir() { if (m_root_dir.empty()) { - m_root_dir = get_root_dir(); + set_root_dir(discover_root_dir()); } return m_root_dir; } +void Testbed::set_root_dir(const fs::path& dir) { + m_root_dir = dir; +} + inline float linear_to_db(float x) { return -10.f*logf(x)/logf(10.f); } @@ -619,8 +609,10 @@ void Testbed::set_crop_box(mat4x3 m, bool nerf_space) { if (nerf_space) { m = m_nerf.training.dataset.nerf_matrix_to_ngp(m, true); } - vec3 radius(length(m[0]), length(m[1]), length(m[2])); + + vec3 radius{length(m[0]), length(m[1]), length(m[2])}; vec3 cen(m[3]); + m_render_aabb_to_local = row(m_render_aabb_to_local, 0, m[0] / radius.x); m_render_aabb_to_local = row(m_render_aabb_to_local, 1, m[1] / radius.y); m_render_aabb_to_local = row(m_render_aabb_to_local, 2, m[2] / radius.z); @@ -633,7 +625,7 @@ std::vector Testbed::crop_box_corners(bool nerf_space) const { mat4x3 m = crop_box(nerf_space); std::vector rv(8); for (int i = 0; i < 8; ++i) { - rv[i] = m * vec4((i & 1) ? 1.f : -1.f, (i & 2) ? 1.f : -1.f, (i & 4) ? 1.f : -1.f, 1.f); + rv[i] = m * vec4{(i & 1) ? 1.f : -1.f, (i & 2) ? 1.f : -1.f, (i & 4) ? 1.f : -1.f, 1.f}; /* debug print out corners to check math is all lined up */ if (0) { tlog::info() << rv[i].x << "," << rv[i].y << "," << rv[i].z << " [" << i << "]"; @@ -682,7 +674,7 @@ void Testbed::imgui() { fov(), m_aperture_size, m_bounding_radius, - !m_nerf.training.dataset.xforms.empty() ? m_nerf.training.dataset.xforms[0].start : mat4x3(1.0f), + !m_nerf.training.dataset.xforms.empty() ? m_nerf.training.dataset.xforms[0].start : mat4x3::identity(), m_nerf.glow_mode, m_nerf.glow_y_cutoff )) { @@ -964,7 +956,7 @@ void Testbed::imgui() { ImGui::DragInt("Seed", (int*)&m_seed, 1.0f, 0, std::numeric_limits::max()); ImGui::PopItemWidth(); - m_training_batch_size = next_multiple(m_training_batch_size, batch_size_granularity); + m_training_batch_size = next_multiple(m_training_batch_size, BATCH_SIZE_GRANULARITY); if (m_train) { std::vector timings; @@ -1161,8 +1153,8 @@ void Testbed::imgui() { set_exposure(m_exposure); } - float max_diam = compMax(m_aabb.max - m_aabb.min); - float render_diam = compMax(m_render_aabb.max - m_render_aabb.min); + float max_diam = max(m_aabb.max - m_aabb.min); + float render_diam = max(m_render_aabb.max - m_render_aabb.min); float old_render_diam = render_diam; if (m_testbed_mode == ETestbedMode::Nerf || m_testbed_mode == ETestbedMode::Volume) { @@ -1218,7 +1210,7 @@ void Testbed::imgui() { ImGui::Separator(); vec3 diag = m_render_aabb.diag(); bool edit_diag = false; - float max_diag = compMax(m_aabb.diag()); + float max_diag = max(m_aabb.diag()); edit_diag |= ImGui::SliderFloat("Size x", ((float*)&diag)+0, 0.001f, max_diag, "%.3f"); edit_diag |= ImGui::SliderFloat("Size y", ((float*)&diag)+1, 0.001f, max_diag, "%.3f"); edit_diag |= ImGui::SliderFloat("Size z", ((float*)&diag)+2, 0.001f, max_diag, "%.3f"); @@ -1231,14 +1223,14 @@ void Testbed::imgui() { if (ImGui::Button("Reset crop box")) { accum_reset = true; m_render_aabb = m_aabb; - m_render_aabb_to_local = mat3(1.0f); + m_render_aabb_to_local = mat3::identity(); } ImGui::SameLine(); if (ImGui::Button("rotation only")) { accum_reset = true; vec3 world_cen = transpose(m_render_aabb_to_local) * m_render_aabb.center(); - m_render_aabb_to_local = mat3(1.0f); + m_render_aabb_to_local = mat3::identity(); vec3 new_cen = 
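`set_crop_box` above factors the crop transform into per-axis extents plus a pure rotation: each basis column's length is the half-extent along that axis, and dividing it out leaves unit vectors for the rotation rows. The decomposition in isolation, using the same names as the hunk:

	vec3 radius{length(m[0]), length(m[1]), length(m[2])}; // per-axis half-extents
	mat3 rot = mat3::identity();
	rot = row(rot, 0, m[0] / radius.x); // normalized axes become rotation rows
	rot = row(rot, 1, m[1] / radius.y);
	rot = row(rot, 2, m[2] / radius.z);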
m_render_aabb_to_local * world_cen; vec3 old_cen = m_render_aabb.center(); m_render_aabb.min += new_cen - old_cen; @@ -1287,14 +1279,15 @@ void Testbed::imgui() { } if (m_nerf.training.dataset.n_extra_learnable_dims) { - accum_reset |= ImGui::SliderInt("training image latent code for inference", (int*)&m_nerf.extra_dim_idx_for_inference, 0, m_nerf.training.dataset.n_images-1); + accum_reset |= ImGui::SliderInt("Rendering extra dims from training view", (int*)&m_nerf.rendering_extra_dims_from_training_view, -1, m_nerf.training.dataset.n_images-1); } + accum_reset |= ImGui::Checkbox("Gbuffer hard edges", &m_nerf.render_gbuffer_hard_edges); + accum_reset |= ImGui::Combo("Groundtruth render mode", (int*)&m_ground_truth_render_mode, GroundTruthRenderModeStr); accum_reset |= ImGui::SliderFloat("Groundtruth alpha", &m_ground_truth_alpha, 0.0f, 1.0f, "%.02f", ImGuiSliderFlags_AlwaysClamp); bool lens_changed = ImGui::Checkbox("Apply lens distortion", &m_nerf.render_with_lens_distortion); - if (m_nerf.render_with_lens_distortion) { lens_changed |= ImGui::Combo("Lens mode", (int*)&m_nerf.render_lens.mode, LensModeStr); if (m_nerf.render_lens.mode == ELensMode::OpenCV) { @@ -1320,10 +1313,10 @@ void Testbed::imgui() { if (lens_changed && !supports_dlss(m_nerf.render_lens.mode)) { m_dlss = false; } - - accum_reset |= lens_changed; } + accum_reset |= lens_changed; + accum_reset |= ImGui::SliderFloat("Min transmittance", &m_nerf.render_min_transmittance, 0.0f, 1.0f, "%.3f", ImGuiSliderFlags_Logarithmic | ImGuiSliderFlags_NoRoundToFormat); ImGui::TreePop(); } @@ -1397,10 +1390,6 @@ void Testbed::imgui() { } if (m_testbed_mode == ETestbedMode::Nerf) { - if (ImGui::Button("Add training views to camera path")) { - add_training_views_to_camera_path(); - } - if (ImGui::Button("First")) { first_training_view(); } @@ -1546,7 +1535,7 @@ void Testbed::imgui() { if (ImGui::Button("Save")) { try { save_snapshot(m_imgui.snapshot_path, m_include_optimizer_state_in_snapshot, m_compress_snapshot); - } catch (std::exception& e) { + } catch (const std::exception& e) { imgui_error_string = fmt::format("Failed to save snapshot: {}", e.what()); ImGui::OpenPopup("Error"); } @@ -1555,7 +1544,7 @@ void Testbed::imgui() { if (ImGui::Button("Load")) { try { load_snapshot(m_imgui.snapshot_path); - } catch (std::exception& e) { + } catch (const std::exception& e) { imgui_error_string = fmt::format("Failed to load snapshot: {}", e.what()); ImGui::OpenPopup("Error"); } @@ -1630,7 +1619,7 @@ void Testbed::imgui() { auto effective_view_dir = flip_y_and_z_axes ? vec3{0.0f, 1.0f, 0.0f} : vec3{0.0f, 0.0f, 1.0f}; auto old_local = m_render_aabb_to_local; auto old_aabb = m_render_aabb; - m_render_aabb_to_local = mat3(1.0f); + m_render_aabb_to_local = mat3::identity(); auto dir = m_data_path.is_directory() || m_data_path.empty() ? 
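The "rotation only" reset just above keeps the crop box's world-space center fixed while discarding its orientation: express the center in world space under the old rotation, re-express it in the new identity frame, and shift the AABB by the difference. Its core, with `R_old` and `aabb` standing in for `m_render_aabb_to_local` and `m_render_aabb`:

	vec3 world_cen = transpose(R_old) * aabb.center(); // local -> world
	mat3 R_new = mat3::identity();
	vec3 new_cen = R_new * world_cen;                  // world -> new local frame
	vec3 old_cen = aabb.center();
	aabb.min += new_cen - old_cen;                     // translate box, center preserved
	aabb.max += new_cen - old_cen;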
(m_data_path / "volume_raw") : (m_data_path.parent_path() / fmt::format("{}_volume_raw", m_data_path.filename())); if (!dir.exists()) { fs::create_directory(dir); @@ -1687,7 +1676,7 @@ void Testbed::imgui() { accum_reset |= ImGui::SliderFloat("Clearcoat", &m_sdf.brdf.clearcoat, 0.f, 1.f); accum_reset |= ImGui::SliderFloat("Clearcoat gloss", &m_sdf.brdf.clearcoat_gloss, 0.f, 1.f); } - m_sdf.brdf.ambientcolor = (m_background_color * m_background_color).rgb; + m_sdf.brdf.ambientcolor = (m_background_color * m_background_color).rgb(); } if (ImGui::CollapsingHeader("Histograms of encoding parameters")) { @@ -1704,11 +1693,11 @@ void Testbed::imgui() { // Hashgrid statistics - for (int i = 0; i < m_n_levels; ++i) { + for (uint32_t i = 0; i < m_n_levels; ++i) { f[i] = m_level_stats[i].mean(); } ImGui::PlotHistogram("Grid means", f.data(), m_n_levels, 0, "means", FLT_MAX, FLT_MAX, ImVec2(0, 60.f)); - for (int i = 0; i < m_n_levels; ++i) { + for (uint32_t i = 0; i < m_n_levels; ++i) { f[i] = m_level_stats[i].sigma(); } ImGui::PlotHistogram("Grid sigmas", f.data(), m_n_levels, 0, "sigma", FLT_MAX, FLT_MAX, ImVec2(0, 60.f)); @@ -1716,7 +1705,7 @@ void Testbed::imgui() { // Histogram of trained hashgrid params - ImGui::SliderInt("Show details for level", &m_histo_level, 0, m_n_levels - 1); + ImGui::SliderInt("Show details for level", (int*)&m_histo_level, 0, m_n_levels - 1); if (m_histo_level < m_n_levels) { LevelStats& s = m_level_stats[m_histo_level]; static bool excludezero = false; @@ -1775,12 +1764,12 @@ void Testbed::draw_visualizations(ImDrawList* list, const mat4x3& camera_matrix) float xyscale = (float)m_window_res[m_fov_axis]; vec2 screen_center = render_screen_center(m_screen_center); - mat4 view2proj = transpose(mat4( + mat4 view2proj = transpose(mat4{ xyscale, 0.0f, (float)m_window_res.x*screen_center.x * zscale, 0.0f, 0.0f, xyscale, (float)m_window_res.y*screen_center.y * zscale, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 0.0f, zscale, 0.0f - )); + 0.0f, 0.0f, zscale, 0.0f, + }); mat4 world2proj = view2proj * world2view; float aspect = (float)m_window_res.x / (float)m_window_res.y; @@ -1793,7 +1782,7 @@ void Testbed::draw_visualizations(ImDrawList* list, const mat4x3& camera_matrix) } if (m_visualize_unit_cube) { - visualize_cube(list, world2proj, vec3(0.f), vec3(1.f), mat3(1.0f)); + visualize_cube(list, world2proj, vec3(0.f), vec3(1.f), mat3::identity()); } if (m_edit_render_aabb) { @@ -1806,17 +1795,17 @@ void Testbed::draw_visualizations(ImDrawList* list, const mat4x3& camera_matrix) float fly = focal.y; float zfar = m_ndc_zfar; float znear = m_ndc_znear; - mat4 view2proj_guizmo = transpose(mat4( + mat4 view2proj_guizmo = transpose(mat4{ fly * 2.0f / aspect, 0.0f, 0.0f, 0.0f, 0.0f, -fly * 2.f, 0.0f, 0.0f, 0.0f, 0.0f, (zfar + znear) / (zfar - znear), -(2.0f * zfar * znear) / (zfar - znear), - 0.0f, 0.0f, 1.0f, 0.0f - )); + 0.0f, 0.0f, 1.0f, 0.0f, + }); ImGuizmo::SetRect(0, 0, io.DisplaySize.x, io.DisplaySize.y); - static mat4 matrix = mat4(1.0f); - static mat4 world2view_guizmo = mat4(1.0f); + static mat4 matrix = mat4::identity(); + static mat4 world2view_guizmo = mat4::identity(); vec3 cen = transpose(m_render_aabb_to_local) * m_render_aabb.center(); if (!ImGuizmo::IsUsing()) { @@ -1834,7 +1823,6 @@ void Testbed::draw_visualizations(ImDrawList* list, const mat4x3& camera_matrix) auto prev_matrix = matrix; if (ImGuizmo::Manipulate((const float*)&world2view_guizmo, (const float*)&view2proj_guizmo, m_camera_path.m_gizmo_op, ImGuizmo::LOCAL, (float*)&matrix, NULL, NULL)) { - auto 
crop_transform = matrix; if (m_edit_world_transform) { // We transform the world by transforming the camera in the opposite direction. auto rel = prev_matrix * inverse(matrix); @@ -1844,7 +1832,7 @@ void Testbed::draw_visualizations(ImDrawList* list, const mat4x3& camera_matrix) m_up_dir = mat3(rel) * m_up_dir; } else { m_render_aabb_to_local = transpose(mat3(matrix)); - vec3 new_cen = m_render_aabb_to_local * matrix[3].xyz; + vec3 new_cen = m_render_aabb_to_local * matrix[3].xyz(); vec3 old_cen = m_render_aabb.center(); m_render_aabb.min += new_cen - old_cen; m_render_aabb.max += new_cen - old_cen; @@ -1921,9 +1909,6 @@ bool Testbed::keyboard_event() { if (ImGui::IsKeyPressed('G')) { m_render_ground_truth = !m_render_ground_truth; reset_accumulation(); - if (m_render_ground_truth) { - m_nerf.training.view = find_best_training_view(m_nerf.training.view); - } } if (ImGui::IsKeyPressed('T')) { @@ -2029,7 +2014,7 @@ bool Testbed::keyboard_event() { translate_vec *= m_camera_velocity * m_frame_ms.val() / 1000.0f; if (shift) { - translate_vec *= 5; + translate_vec *= 5.0f; } if (translate_vec != vec3(0.0f)) { @@ -2056,7 +2041,7 @@ void Testbed::mouse_wheel() { // When in image mode, zoom around the hovered point. if (m_testbed_mode == ETestbedMode::Image) { - ivec2 mouse = {ImGui::GetMousePos().x, ImGui::GetMousePos().y}; + vec2 mouse = {ImGui::GetMousePos().x, ImGui::GetMousePos().y}; vec3 offset = get_3d_pos_from_pixel(*m_views.front().render_buffer, mouse) - look_at(); // Don't center around infinitely distant points. @@ -2076,9 +2061,8 @@ mat3 Testbed::rotation_from_angles(const vec2& angles) const { void Testbed::mouse_drag() { vec2 rel = vec2{ImGui::GetIO().MouseDelta.x, ImGui::GetIO().MouseDelta.y} / (float)m_window_res[m_fov_axis]; - ivec2 mouse = {ImGui::GetMousePos().x, ImGui::GetMousePos().y}; + vec2 mouse = {ImGui::GetMousePos().x, ImGui::GetMousePos().y}; - vec3 up = m_up_dir; vec3 side = m_camera[0]; bool shift = ImGui::GetIO().KeyMods & ImGuiKeyModFlags_Shift; @@ -2122,7 +2106,7 @@ void Testbed::mouse_drag() { // Middle pressed if (ImGui::GetIO().MouseClicked[2]) { - m_drag_depth = get_depth_from_renderbuffer(*m_views.front().render_buffer, vec2(mouse) / vec2(m_window_res)); + m_drag_depth = get_depth_from_renderbuffer(*m_views.front().render_buffer, mouse / vec2(m_window_res)); } // Middle held @@ -2176,11 +2160,12 @@ void Testbed::handle_user_input() { if (m_testbed_mode == ETestbedMode::Nerf && (m_render_ground_truth || m_nerf.training.render_error_overlay)) { // find nearest training view to current camera, and set it - int bestimage = find_best_training_view(-1); - if (bestimage >= 0) { - m_nerf.training.view = bestimage; - if (ImGui::GetIO().MouseReleased[0]) {// snap camera to ground truth view on mouse up - set_camera_to_training_view(m_nerf.training.view); + int bestimage = m_nerf.find_closest_training_view(m_camera); + m_nerf.training.view = bestimage; + if (ImGui::GetIO().MouseReleased[0]) { // snap camera to ground truth view on mouse up + set_camera_to_training_view(m_nerf.training.view); + if (m_nerf.training.dataset.n_extra_dims()) { + m_nerf.set_rendering_extra_dims_from_training_view(m_nerf.training.view); } } } @@ -2216,7 +2201,7 @@ void Testbed::begin_vr_frame_and_handle_vr_input() { if (n_views > 0) { set_n_views(n_views); - ivec2 total_size = ivec2(0); + ivec2 total_size = 0; for (size_t i = 0; i < n_views; ++i) { ivec2 view_resolution = {views[i].view.subImage.imageRect.extent.width, views[i].view.subImage.imageRect.extent.height}; total_size += 
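`find_closest_training_view`, now used in `handle_user_input` in place of the removed `find_best_training_view`, conceptually scans the training transforms for the view nearest the current camera. A hedged sketch; the real scoring may also weigh orientation, and the helper below is illustrative rather than the actual member function:

	int find_closest_view(const mat4x3& camera, const std::vector<mat4x3>& xforms) {
		int best = -1;
		float best_dist = std::numeric_limits<float>::infinity();
		for (int i = 0; i < (int)xforms.size(); ++i) {
			float dist = length(xforms[i][3] - camera[3]); // distance between camera origins
			if (dist < best_dist) { best_dist = dist; best = i; }
		}
		return best;
	}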
view_resolution; @@ -2522,7 +2507,7 @@ void Testbed::draw_gui() { glBlendEquationSeparate(GL_FUNC_ADD, GL_FUNC_ADD); glBlendFuncSeparate(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - ivec2 extent = ivec2((float)display_w / m_n_views.x, (float)display_h / m_n_views.y); + ivec2 extent = {(int)((float)display_w / m_n_views.x), (int)((float)display_h / m_n_views.y)}; int i = 0; for (int y = 0; y < m_n_views.y; ++y) { @@ -2547,7 +2532,7 @@ void Testbed::draw_gui() { auto draw_mesh = [&]() { glClear(GL_DEPTH_BUFFER_BIT); - ivec2 res(display_w, display_h); + ivec2 res = {display_w, display_h}; vec2 focal_length = calc_focal_length(res, m_relative_focal_length, m_fov_axis, m_zoom); draw_mesh_gl(m_mesh.verts, m_mesh.vert_normals, m_mesh.vert_colors, m_mesh.indices, res, focal_length, m_smoothed_camera, render_screen_center(m_screen_center), (int)m_mesh_render_mode); }; @@ -2597,11 +2582,11 @@ __global__ void to_8bit_color_kernel( surf2Dread((float4*)&color, surface, x * sizeof(float4), y); if (output_color_space == EColorSpace::Linear) { - color.rgb = linear_to_srgb(color.rgb); + color.rgb() = linear_to_srgb(color.rgb()); } for (uint32_t i = 0; i < 3; ++i) { - result[(x + resolution.x * y) * 3 + i] = (uint8_t)(tcnn::clamp(color[i], 0.0f, 1.0f) * 255.0f + 0.5f); + result[(x + resolution.x * y) * 3 + i] = (uint8_t)(clamp(color[i], 0.0f, 1.0f) * 255.0f + 0.5f); } } @@ -2627,7 +2612,7 @@ void Testbed::prepare_next_camera_path_frame() { const dim3 threads = { 16, 8, 1 }; const dim3 blocks = { div_round_up((uint32_t)res.x, threads.x), div_round_up((uint32_t)res.y, threads.y), 1 }; - GPUMemory image_data(compMul(res) * 3); + GPUMemory image_data(product(res) * 3); to_8bit_color_kernel<<>>( res, EColorSpace::SRGB, // the GUI always renders in SRGB @@ -2757,7 +2742,7 @@ void Testbed::train_and_render(bool skip_rendering) { m_render_ms.update(std::chrono::duration(std::chrono::steady_clock::now()-start).count()); }}; - if (norm(m_smoothed_camera - m_camera) < 0.001f) { + if (frobenius_norm(m_smoothed_camera - m_camera) < 0.001f) { m_smoothed_camera = m_camera; } else if (!m_camera_path.rendering) { reset_accumulation(true); @@ -2773,7 +2758,7 @@ void Testbed::train_and_render(bool skip_rendering) { view.visualized_dimension = m_visualized_dimension; } - m_n_views = {m_views.size(), 1}; + m_n_views = {(int)m_views.size(), 1}; m_nerf.render_with_lens_distortion = false; reset_accumulation(true); @@ -2849,8 +2834,8 @@ void Testbed::train_and_render(bool skip_rendering) { size_t n_pixels = 0, n_pixels_full_res = 0; for (const auto& view : m_views) { - n_pixels += compMul(view.render_buffer->in_resolution()); - n_pixels_full_res += compMul(view.full_resolution); + n_pixels += product(view.render_buffer->in_resolution()); + n_pixels_full_res += product(view.full_resolution); } float pixel_ratio = (n_pixels == 0 || (m_train && m_training_step == 0)) ? 
(1.0f / 256.0f) : ((float)n_pixels / (float)n_pixels_full_res); @@ -2861,7 +2846,7 @@ void Testbed::train_and_render(bool skip_rendering) { factor = 8.f / (float)m_fixed_res_factor; } - factor = tcnn::clamp(factor, 1.0f / 16.0f, 1.0f); + factor = clamp(factor, 1.0f / 16.0f, 1.0f); for (auto&& view : m_views) { if (m_dlss) { @@ -2877,7 +2862,7 @@ void Testbed::train_and_render(bool skip_rendering) { new_render_res = m_camera_path.render_settings.resolution; } - float ratio = std::sqrt((float)compMul(render_res) / (float)compMul(new_render_res)); + float ratio = std::sqrt((float)product(render_res) / (float)product(new_render_res)); if (ratio > 1.2f || ratio < 0.8f || factor == 1.0f || !m_dynamic_res || m_camera_path.rendering) { render_res = new_render_res; } @@ -2902,7 +2887,7 @@ void Testbed::train_and_render(bool skip_rendering) { resolution_scale = clamp(resolution_scale * foveation_begin_factor, vec2(1.0f / m_foveated_rendering_max_scaling), vec2(1.0f)); view.foveation = {resolution_scale, vec2(1.0f) - view.screen_center, vec2(m_foveated_rendering_full_res_diameter * 0.5f)}; - m_foveated_rendering_scaling = 2.0f / compAdd(resolution_scale); + m_foveated_rendering_scaling = 2.0f / sum(resolution_scale); } else { view.foveation = {vec2(1.0f / m_foveated_rendering_scaling), vec2(1.0f) - view.screen_center, vec2(m_foveated_rendering_full_res_diameter * 0.5f)}; } @@ -2954,7 +2939,7 @@ void Testbed::train_and_render(bool skip_rendering) { } if (m_picture_in_picture_res > 0) { - ivec2 res(m_picture_in_picture_res, m_picture_in_picture_res * 9/16); + ivec2 res{(int)m_picture_in_picture_res, (int)(m_picture_in_picture_res * 9.0f / 16.0f)}; m_pip_render_buffer->resize(res); if (m_pip_render_buffer->spp() < 8) { // a bit gross, but let's copy the keyframe's state into the global state in order to not have to plumb through the fov etc to render_frame. @@ -3122,7 +3107,6 @@ void Testbed::init_window(int resw, int resh, bool hidden, bool second_window) { return; } - testbed->redraw_gui_next_frame(); for (int i = 0; i < count; i++) { testbed->load_file(paths[i]); } @@ -3137,7 +3121,11 @@ void Testbed::init_window(int resw, int resh, bool hidden, bool second_window) { glfwSetCursorPosCallback(m_glfw_window, [](GLFWwindow* window, double xpos, double ypos) { Testbed* testbed = (Testbed*)glfwGetWindowUserPointer(window); - if (testbed) { + if ( + testbed && + (ImGui::IsAnyItemActive() || ImGui::GetIO().WantCaptureMouse || ImGuizmo::IsUsing()) && + (ImGui::GetIO().MouseDown[0] || ImGui::GetIO().MouseDown[1] || ImGui::GetIO().MouseDown[2]) + ) { testbed->redraw_gui_next_frame(); } }); @@ -3355,7 +3343,7 @@ bool Testbed::frame() { // Render against the trained neural network. If we're training and already close to convergence, // we can skip rendering if the scene camera doesn't change - uint32_t n_to_skip = m_train ? tcnn::clamp(m_training_step / 16u, 15u, 255u) : 0; + uint32_t n_to_skip = m_train ? 
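The dynamic-resolution logic above applies hysteresis: a newly computed target resolution is only adopted when it differs from the current one by roughly 20% or more on a linear scale, which keeps the render buffers from being reallocated every frame over small fluctuations. The decision in isolation:

	float ratio = std::sqrt((float)product(render_res) / (float)product(new_render_res));
	if (ratio > 1.2f || ratio < 0.8f) {
		render_res = new_render_res; // change is large enough to justify reallocation
	}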
clamp(m_training_step / 16u, 15u, 255u) : 0; if (m_render_skip_due_to_lack_of_camera_movement_counter > n_to_skip) { m_render_skip_due_to_lack_of_camera_movement_counter = 0; } @@ -3379,7 +3367,7 @@ bool Testbed::frame() { } #endif - if (!skip_rendering || (std::chrono::steady_clock::now() - m_last_gui_draw_time_point) > 25ms) { + if (!skip_rendering || std::chrono::steady_clock::now() - m_last_gui_draw_time_point > 50ms) { redraw_gui_next_frame(); } @@ -3387,7 +3375,7 @@ bool Testbed::frame() { while (true) { (*m_task_queue.tryPop())(); } - } catch (SharedQueueEmptyException&) {} + } catch (const SharedQueueEmptyException&) {} train_and_render(skip_rendering); @@ -3633,9 +3621,9 @@ void Testbed::reset_network(bool clear_density_grid) { if (m_testbed_mode == ETestbedMode::Nerf) { m_nerf.training.loss_type = string_to_loss_type(loss_config.value("otype", "L2")); - // Some of the Nerf-supported losses are not supported by tcnn::Loss, + // Some of the Nerf-supported losses are not supported by Loss, // so just create a dummy L2 loss there. The NeRF code path will bypass - // the tcnn::Loss in any case. + // the Loss in any case. loss_config["otype"] = "L2"; } @@ -3664,7 +3652,7 @@ void Testbed::reset_network(bool clear_density_grid) { float desired_resolution = 2048.0f; // Desired resolution of the finest hashgrid level over the unit cube if (m_testbed_mode == ETestbedMode::Image) { - desired_resolution = compMax(m_image.resolution) / 2.0f; + desired_resolution = max(m_image.resolution) / 2.0f; } else if (m_testbed_mode == ETestbedMode::Volume) { desired_resolution = m_volume.world2index_scale; } @@ -3686,8 +3674,8 @@ void Testbed::reset_network(bool clear_density_grid) { ; } - m_loss.reset(create_loss(loss_config)); - m_optimizer.reset(create_optimizer(optimizer_config)); + m_loss.reset(create_loss(loss_config)); + m_optimizer.reset(create_optimizer(optimizer_config)); size_t n_encoding_params = 0; if (m_testbed_mode == ETestbedMode::Nerf) { @@ -3696,7 +3684,7 @@ void Testbed::reset_network(bool clear_density_grid) { m_nerf.training.cam_rot_offset.resize(m_nerf.training.dataset.n_images, RotationAdamOptimizer(1e-4f)); m_nerf.training.cam_focal_length_offset = AdamOptimizer(1e-5f); - m_nerf.training.reset_extra_dims(m_rng); + m_nerf.reset_extra_dims(m_rng); json& dir_encoding_config = config["dir_encoding"]; json& rgb_network_config = config["rgb_network"]; @@ -3706,7 +3694,7 @@ void Testbed::reset_network(bool clear_density_grid) { // Instantiate an additional model for each auxiliary GPU for (auto& device : m_devices) { - device.set_nerf_network(std::make_shared>( + device.set_nerf_network(std::make_shared>( dims.n_pos, n_dir_dims, n_extra_dims, @@ -3741,7 +3729,6 @@ void Testbed::reset_network(bool clear_density_grid) { << "]-->" << 3 ; - // Create distortion map model { json& distortion_map_optimizer_config = config.contains("distortion_map") && config["distortion_map"].contains("optimizer") ? 
config["distortion_map"]["optimizer"] : optimizer_config; @@ -3769,15 +3756,15 @@ void Testbed::reset_network(bool clear_density_grid) { m_sdf.brick_data.free_memory(); } - m_encoding.reset(new TakikawaEncoding( + m_encoding.reset(new TakikawaEncoding( encoding_config["starting_level"], m_sdf.triangle_octree, - tcnn::string_to_interpolation_type(encoding_config.value("interpolation", "linear")) + string_to_interpolation_type(encoding_config.value("interpolation", "linear")) )); m_sdf.uses_takikawa_encoding = true; } else { - m_encoding.reset(create_encoding(dims.n_input, encoding_config)); + m_encoding.reset(create_encoding(dims.n_input, encoding_config)); m_sdf.uses_takikawa_encoding = false; if (m_sdf.octree_depth_target == 0 && encoding_config.contains("n_levels")) { @@ -3786,7 +3773,7 @@ void Testbed::reset_network(bool clear_density_grid) { } for (auto& device : m_devices) { - device.set_network(std::make_shared>(m_encoding, dims.n_output, network_config)); + device.set_network(std::make_shared>(m_encoding, dims.n_output, network_config)); } m_network = primary_device().network(); @@ -3799,14 +3786,15 @@ void Testbed::reset_network(bool clear_density_grid) { << "]-->" << m_encoding->padded_output_width() << "--[" << std::string(network_config["otype"]) << "(neurons=" << (int)network_config["n_neurons"] << ",layers=" << ((int)network_config["n_hidden_layers"]+2) << ")" - << "]-->" << dims.n_output; + << "]-->" << dims.n_output + ; } size_t n_network_params = m_network->n_params() - n_encoding_params; tlog::info() << " total_encoding_params=" << n_encoding_params << " total_network_params=" << n_network_params; - m_trainer = std::make_shared>(m_network, m_optimizer, m_loss, m_seed); + m_trainer = std::make_shared>(m_network, m_optimizer, m_loss, m_seed); m_training_step = 0; m_training_start_time_point = std::chrono::steady_clock::now(); @@ -3831,6 +3819,19 @@ void Testbed::reset_network(bool clear_density_grid) { } Testbed::Testbed(ETestbedMode mode) { + tcnn::set_log_callback([](LogSeverity severity, const std::string& msg) { + tlog::ESeverity s = tlog::ESeverity::Info; + switch (severity) { + case LogSeverity::Info: s = tlog::ESeverity::Info; break; + case LogSeverity::Debug: s = tlog::ESeverity::Debug; break; + case LogSeverity::Warning: s = tlog::ESeverity::Warning; break; + case LogSeverity::Error: s = tlog::ESeverity::Error; break; + case LogSeverity::Success: s = tlog::ESeverity::Success; break; + default: break; + } + tlog::log(s) << msg; + }); + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { throw std::runtime_error{"Testbed requires CUDA 10.2 or later."}; } @@ -3866,7 +3867,13 @@ Testbed::Testbed(ETestbedMode mode) { int active_device = cuda_device(); int active_compute_capability = cuda_compute_capability(); - tlog::success() << "Initialized CUDA. Active GPU is #" << active_device << ": " << cuda_device_name() << " [" << active_compute_capability << "]"; + tlog::success() << fmt::format( + "Initialized CUDA {}. Active GPU is #{}: {} [{}]", + cuda_runtime_version_string(), + active_device, + cuda_device_name(), + active_compute_capability + ); if (active_compute_capability < MIN_GPU_ARCH) { tlog::warning() << "Insufficient compute capability " << active_compute_capability << " detected."; @@ -3996,7 +4003,7 @@ void Testbed::train(uint32_t batch_size) { reset_accumulation(false, false); } - uint32_t n_prep_to_skip = m_testbed_mode == ETestbedMode::Nerf ? 
tcnn::clamp(m_training_step / 16u, 1u, 16u) : 1u; + uint32_t n_prep_to_skip = m_testbed_mode == ETestbedMode::Nerf ? clamp(m_training_step / 16u, 1u, 16u) : 1u; if (m_training_step % n_prep_to_skip == 0) { auto start = std::chrono::steady_clock::now(); ScopeGuard timing_guard{[&]() { @@ -4053,7 +4060,7 @@ vec2 Testbed::calc_focal_length(const ivec2& resolution, const vec2& relative_fo vec2 Testbed::render_screen_center(const vec2& screen_center) const { // see pixel_to_ray for how screen center is used; 0.5, 0.5 is 'normal'. we flip so that it becomes the point in the original image we want to center on. - return (vec2(0.5f) - screen_center) * m_zoom + vec2(0.5f); + return (0.5f - screen_center) * m_zoom + 0.5f; } __global__ void dlss_prep_kernel( @@ -4090,7 +4097,7 @@ __global__ void dlss_prep_kernel( const float depth = depth_buffer[idx]; vec2 mvec = motion_vector( sample_index, - {x, y}, + {(int)x, (int)y}, resolution, focal_length, camera, @@ -4137,7 +4144,7 @@ __global__ void spherical_checkerboard_kernel( Ray ray = pixel_to_ray( 0, - {x, y}, + {(int)x, (int)y}, resolution, focal_length, camera, @@ -4161,7 +4168,7 @@ __global__ void spherical_checkerboard_kernel( // Blend background color on top of checkerboard first (checkerboard is meant to be "behind" the background, // representing transparency), and then blend the result behind the frame buffer. - background_color.rgb = srgb_to_linear(background_color.rgb); + background_color.rgb() = srgb_to_linear(background_color.rgb()); background_color += (1.0f - background_color.a) * checker; uint32_t idx = x + resolution.x * y; @@ -4196,7 +4203,7 @@ __global__ void vr_overlay_hands_kernel( Ray ray = pixel_to_ray( 0, - {x, y}, + {(int)x, (int)y}, resolution, focal_length, camera, @@ -4255,13 +4262,13 @@ __global__ void vr_overlay_hands_kernel( vec4 prev_color; surf2Dread((float4*)&prev_color, surface, x * sizeof(float4), y); if (output_color_space == EColorSpace::SRGB) { - prev_color.rgb = srgb_to_linear(prev_color.rgb); + prev_color.rgb() = srgb_to_linear(prev_color.rgb()); } color += (1.0f - color.a) * prev_color; if (output_color_space == EColorSpace::SRGB) { - color.rgb = linear_to_srgb(color.rgb); + color.rgb() = linear_to_srgb(color.rgb()); } surf2Dwrite(to_float4(color), surface, x * sizeof(float4), y); @@ -4318,7 +4325,7 @@ void Testbed::render_frame_main( switch (m_testbed_mode) { case ETestbedMode::Nerf: if (!m_render_ground_truth || m_ground_truth_alpha < 1.0f) { - render_nerf(device.stream(), device.render_buffer_view(), *device.nerf_network(), device.data().density_grid_bitfield_ptr, focal_length, camera_matrix0, camera_matrix1, nerf_rolling_shutter, screen_center, foveation, visualized_dimension); + render_nerf(device.stream(), device, device.render_buffer_view(), device.nerf_network(), device.data().density_grid_bitfield_ptr, focal_length, camera_matrix0, camera_matrix1, nerf_rolling_shutter, screen_center, foveation, visualized_dimension); } break; case ETestbedMode::Sdf: @@ -4370,7 +4377,7 @@ void Testbed::render_frame_main( ); } } : (distance_fun_t)[&](uint32_t n_elements, const vec3* positions, float* distances, cudaStream_t stream) { - n_elements = next_multiple(n_elements, tcnn::batch_size_granularity); + n_elements = next_multiple(n_elements, BATCH_SIZE_GRANULARITY); GPUMatrix positions_matrix((float*)positions, 3, n_elements); GPUMatrix distances_matrix(distances, 1, n_elements); m_network->inference(stream, positions_matrix, distances_matrix); @@ -4380,7 +4387,7 @@ void Testbed::render_frame_main( 
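Both throttles above, `n_to_skip` for re-rendering a static camera and `n_prep_to_skip` for training prep, follow the same schedule: run every n-th step where n grows linearly with the training step and is clamped to a fixed range, so the work happens often early in training and rarely near convergence. The shared pattern:

	uint32_t skip_interval(uint32_t step, uint32_t lo, uint32_t hi) {
		return clamp(step / 16u, lo, hi); // e.g. lo = 1, hi = 16 for training prep
	}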
m_render_ground_truth ? (normals_fun_t)[&](uint32_t n_elements, const vec3* positions, vec3* normals, cudaStream_t stream) { // NO-OP. Normals will automatically be populated by raytrace } : (normals_fun_t)[&](uint32_t n_elements, const vec3* positions, vec3* normals, cudaStream_t stream) { - n_elements = next_multiple(n_elements, tcnn::batch_size_granularity); + n_elements = next_multiple(n_elements, BATCH_SIZE_GRANULARITY); GPUMatrix positions_matrix((float*)positions, 3, n_elements); GPUMatrix normals_matrix((float*)normals, 3, n_elements); m_network->input_gradient(stream, 0, positions_matrix, normals_matrix); @@ -4463,7 +4470,7 @@ void Testbed::render_frame_epilogue( EColorSpace output_color_space = to_srgb ? EColorSpace::SRGB : EColorSpace::Linear; if (m_render_transparency_as_checkerboard) { - mat4x3 checkerboard_transform = mat4x3(1.0f); + mat4x3 checkerboard_transform = mat4x3::identity(); #ifdef NGP_GUI if (m_hmd && m_vr_frame_info && !m_vr_frame_info->views.empty()) { @@ -4582,15 +4589,15 @@ float Testbed::get_depth_from_renderbuffer(const CudaRenderBuffer& render_buffer float depth; auto res = render_buffer.in_resolution(); - ivec2 depth_pixel = clamp(ivec2(uv * vec2(res)), ivec2(0), res - ivec2(1)); + ivec2 depth_pixel = clamp(ivec2(uv * vec2(res)), 0, res - 1); CUDA_CHECK_THROW(cudaMemcpy(&depth, render_buffer.depth_buffer() + depth_pixel.x + depth_pixel.y * res.x, sizeof(float), cudaMemcpyDeviceToHost)); return depth; } -vec3 Testbed::get_3d_pos_from_pixel(const CudaRenderBuffer& render_buffer, const ivec2& pixel) { - float depth = get_depth_from_renderbuffer(render_buffer, vec2(pixel) / vec2(m_window_res)); - auto ray = pixel_to_ray_pinhole(0, pixel, m_window_res, calc_focal_length(m_window_res, m_relative_focal_length, m_fov_axis, m_zoom), m_smoothed_camera, render_screen_center(m_screen_center)); +vec3 Testbed::get_3d_pos_from_pixel(const CudaRenderBuffer& render_buffer, const vec2& pixel) { + float depth = get_depth_from_renderbuffer(render_buffer, pixel / vec2(m_window_res)); + auto ray = pixel_to_ray_pinhole(0, ivec2(pixel), m_window_res, calc_focal_length(m_window_res, m_relative_focal_length, m_fov_axis, m_zoom), m_smoothed_camera, render_screen_center(m_screen_center)); return ray(depth); } @@ -4645,7 +4652,7 @@ void Testbed::gather_histograms() { CUDA_CHECK_THROW(cudaStreamSynchronize(m_stream.get())); - for (int l = 0; l < m_n_levels; ++l) { + for (uint32_t l = 0; l < m_n_levels; ++l) { m_level_stats[l] = compute_level_stats(grid.data() + hg_enc->level_params_offset(l), hg_enc->level_n_params(l)); } @@ -4861,8 +4868,32 @@ void Testbed::load_snapshot(const fs::path& path) { set_all_devices_dirty(); } -void Testbed::CudaDevice::set_nerf_network(const std::shared_ptr>& nerf_network) { - m_network = m_nerf_network = nerf_network; +Testbed::CudaDevice::CudaDevice(int id, bool is_primary) : m_id{id}, m_is_primary{is_primary} { + auto guard = device_guard(); + m_stream = std::make_unique(); + m_data = std::make_unique(); + m_render_worker = std::make_unique(is_primary ? 
0u : 1u); +} + +ScopeGuard Testbed::CudaDevice::device_guard() { + int prev_device = cuda_device(); + if (prev_device == m_id) { + return {}; + } + + set_cuda_device(m_id); + return ScopeGuard{[prev_device]() { + set_cuda_device(prev_device); + }}; +} + +void Testbed::CudaDevice::set_network(const std::shared_ptr>& network) { + m_network = network; +} + +void Testbed::CudaDevice::set_nerf_network(const std::shared_ptr>& nerf_network) { + m_nerf_network = nerf_network; + set_network(nerf_network); } void Testbed::sync_device(CudaRenderBuffer& render_buffer, Testbed::CudaDevice& device) { @@ -4904,6 +4935,7 @@ void Testbed::sync_device(CudaRenderBuffer& render_buffer, Testbed::CudaDevice& } device.set_dirty(false); + device.signal(m_stream.get()); } // From https://stackoverflow.com/questions/20843271/passing-a-non-copyable-closure-object-to-stdfunction-parameter @@ -4930,7 +4962,7 @@ ScopeGuard Testbed::use_device(cudaStream_t stream, CudaRenderBuffer& render_buf int active_device = cuda_device(); auto guard = device.device_guard(); - size_t n_pixels = compMul(render_buffer.in_resolution()); + size_t n_pixels = product(render_buffer.in_resolution()); GPUMemoryArena::Allocation alloc; auto scratch = allocate_workspace_and_distribute(device.stream(), &alloc, n_pixels, n_pixels); @@ -4945,8 +4977,8 @@ ScopeGuard Testbed::use_device(cudaStream_t stream, CudaRenderBuffer& render_buf return ScopeGuard{make_copyable_function([&render_buffer, &device, guard=std::move(guard), alloc=std::move(alloc), active_device, stream]() { // Copy device's render buffer's data onto the original render buffer - CUDA_CHECK_THROW(cudaMemcpyPeerAsync(render_buffer.frame_buffer(), active_device, device.render_buffer_view().frame_buffer, device.id(), compMul(render_buffer.in_resolution()) * sizeof(vec4), device.stream())); - CUDA_CHECK_THROW(cudaMemcpyPeerAsync(render_buffer.depth_buffer(), active_device, device.render_buffer_view().depth_buffer, device.id(), compMul(render_buffer.in_resolution()) * sizeof(float), device.stream())); + CUDA_CHECK_THROW(cudaMemcpyPeerAsync(render_buffer.frame_buffer(), active_device, device.render_buffer_view().frame_buffer, device.id(), product(render_buffer.in_resolution()) * sizeof(vec4), device.stream())); + CUDA_CHECK_THROW(cudaMemcpyPeerAsync(render_buffer.depth_buffer(), active_device, device.render_buffer_view().depth_buffer, device.id(), product(render_buffer.in_resolution()) * sizeof(float), device.stream())); device.set_render_buffer_view({}); device.signal(stream); @@ -4960,7 +4992,7 @@ void Testbed::set_all_devices_dirty() { } void Testbed::load_camera_path(const fs::path& path) { - m_camera_path.load(path, mat4x3(1.0f)); + m_camera_path.load(path, mat4x3::identity()); } bool Testbed::loop_animation() { @@ -4971,5 +5003,5 @@ void Testbed::set_loop_animation(bool value) { m_camera_path.loop = value; } -NGP_NAMESPACE_END +} diff --git a/src/testbed_image.cu b/src/testbed_image.cu index bdaee92a3..329ec83f6 100644 --- a/src/testbed_image.cu +++ b/src/testbed_image.cu @@ -26,9 +26,7 @@ #include -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { Testbed::NetworkDims Testbed::network_dims_image() const { NetworkDims dims; @@ -104,7 +102,7 @@ __global__ void init_image_coords( // Hence: generate rays and intersect that plane. 
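The new `device_guard()` leans on `ScopeGuard` to restore the previously active CUDA device whenever the guard goes out of scope, including on exceptions. For reference, a minimal stand-in with the same shape; the real type ships with tiny-cuda-nn and this sketch is purely illustrative:

	#include <functional>
	#include <utility>

	class ScopeGuardSketch {
		std::function<void()> m_callback;
	public:
		ScopeGuardSketch() = default; // empty guard, as returned when no device switch is needed
		explicit ScopeGuardSketch(std::function<void()> callback) : m_callback{std::move(callback)} {}
		ScopeGuardSketch(ScopeGuardSketch&& other) { std::swap(m_callback, other.m_callback); }
		~ScopeGuardSketch() { if (m_callback) m_callback(); } // runs the cleanup at scope exit
	};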
Ray ray = pixel_to_ray( sample_index, - {x, y}, + {(int)x, (int)y}, resolution, focal_length, camera_matrix, @@ -128,11 +126,11 @@ __global__ void init_image_coords( return; } - vec2 uv = ray(t).xy; + vec2 uv = ray(t).xy(); // Flip from world coordinates where Y goes up to image coordinates where Y goes down. // Also, multiply the x-axis by the image's aspect ratio to make it have the right proportions. - uv = (uv - vec2(0.5f)) * vec2(aspect, -1.0f) + vec2(0.5f); + uv = (uv - 0.5f) * vec2{aspect, -1.0f} + 0.5f; depth_buffer[idx] = t; positions[idx] = uv; @@ -173,10 +171,10 @@ __global__ void eval_image_kernel_and_snap(uint32_t n_elements, const T* __restr vec2 pos = positions[i]; auto read_val = [&](int x, int y) { - auto val = ((tcnn::vector_t*)texture)[y * resolution.x + x]; - vec4 result{val[0], val[1], val[2], val[3]}; + auto val = ((tvec*)texture)[y * resolution.x + x]; + vec4 result{(float)val[0], (float)val[1], (float)val[2], (float)val[3]}; if (!linear_colors) { - result.rgb = linear_to_srgb(result.rgb); + result.rgb() = linear_to_srgb(result.rgb()); } return result; }; @@ -184,16 +182,16 @@ __global__ void eval_image_kernel_and_snap(uint32_t n_elements, const T* __restr vec4 val; if (snap_to_pixel_centers) { ivec2 pos_int = floor(pos * vec2(resolution)); - positions[i] = (vec2(pos_int) + vec2(0.5f)) / vec2(resolution); - pos_int = clamp(pos_int, ivec2(0), resolution - ivec2(1)); + positions[i] = (vec2(pos_int) + 0.5f) / vec2(resolution); + pos_int = clamp(pos_int, 0, resolution - 1); val = read_val(pos_int.x, pos_int.y); } else { - pos = clamp(pos * vec2(resolution) - vec2(0.5f), vec2(0.0f), vec2(resolution) - vec2(1.0f + 1e-4f)); + pos = clamp(pos * vec2(resolution) - 0.5f, 0.0f, vec2(resolution) - (1.0f + 1e-4f)); const ivec2 pos_int = pos; const vec2 weight = pos - vec2(pos_int); - const ivec2 idx = clamp(pos_int, ivec2(0), resolution - ivec2(2)); + const ivec2 idx = clamp(pos_int, 0, resolution - 2); val = (1 - weight.x) * (1 - weight.y) * read_val(idx.x, idx.y) + @@ -215,11 +213,8 @@ void Testbed::train_image(size_t target_batch_size, bool get_loss_scalar, cudaSt const uint32_t n_output_dims = 3; const uint32_t n_input_dims = 2; - // Auxiliary matrices for training const uint32_t batch_size = (uint32_t)target_batch_size; - // Permute all training records to de-correlate training data - const uint32_t n_elements = batch_size; m_image.training.positions.enlarge(n_elements); m_image.training.targets.enlarge(n_elements); @@ -271,16 +266,11 @@ void Testbed::train_image(size_t target_batch_size, bool get_loss_scalar, cudaSt GPUMatrix training_batch_matrix((float*)(m_image.training.positions.data()), n_input_dims, batch_size); GPUMatrix training_target_matrix((float*)(m_image.training.targets.data()), n_output_dims, batch_size); - - { - auto ctx = m_trainer->training_step(stream, training_batch_matrix, training_target_matrix, nullptr, false); - if (get_loss_scalar) { - m_loss_scalar.update(m_trainer->loss(stream, *ctx)); - } + auto ctx = m_trainer->training_step(stream, training_batch_matrix, training_target_matrix); + if (get_loss_scalar) { + m_loss_scalar.update(m_trainer->loss(stream, *ctx)); } - - m_trainer->optimizer_step(stream, 128); m_training_step++; } @@ -297,7 +287,7 @@ void Testbed::render_image( // Make sure we have enough memory reserved to render at the requested resolution size_t n_pixels = (size_t)res.x * res.y; - uint32_t n_elements = next_multiple((uint32_t)n_pixels, tcnn::batch_size_granularity); + uint32_t n_elements = next_multiple((uint32_t)n_pixels, 
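The non-snapped branch of `eval_image_kernel_and_snap` below is textbook bilinear filtering: clamp the continuous position into the valid texel range, split it into an integer cell and fractional weights, then blend the four surrounding texels. The same computation in isolation, with `tex(x, y)` standing in for the `read_val` lookup:

	pos = clamp(pos * vec2(resolution) - 0.5f, 0.0f, vec2(resolution) - (1.0f + 1e-4f));
	ivec2 cell = pos;          // integer texel
	vec2 w = pos - vec2(cell); // fractional weights in [0, 1)
	vec4 val =
		(1 - w.x) * (1 - w.y) * tex(cell.x,     cell.y    ) +
		     w.x  * (1 - w.y) * tex(cell.x + 1, cell.y    ) +
		(1 - w.x) *      w.y  * tex(cell.x,     cell.y + 1) +
		     w.x  *      w.y  * tex(cell.x + 1, cell.y + 1);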
BATCH_SIZE_GRANULARITY); m_image.render_coords.enlarge(n_elements); m_image.render_out.enlarge(n_elements); @@ -379,7 +369,7 @@ void Testbed::load_image(const fs::path& data_path) { } m_aabb = m_render_aabb = BoundingBox{vec3(0.0f), vec3(1.0f)}; - m_render_aabb_to_local = mat3(1.0f); + m_render_aabb_to_local = mat3::identity(); tlog::success() << "Loaded a " << (m_image.type == EDataType::Half ? "half" : "full") << "-precision image with " @@ -446,7 +436,7 @@ __global__ void image_coords_from_idx(const uint32_t n_elements, uint32_t offset int x = idx % resolution.x; int y = idx / resolution.x; - pos[i] = (vec2(clamp(ivec2{x, y}, ivec2(0), resolution - ivec2(1))) + vec2(0.5f)) / vec2(resolution); + pos[i] = (vec2(clamp(ivec2{x, y}, 0, resolution - 1)) + 0.5f) / vec2(resolution); } __global__ void image_mse_kernel(const uint32_t n_elements, const vec3* __restrict__ target, const vec3* __restrict__ prediction, float* __restrict__ result, bool quantize_to_byte) { @@ -455,7 +445,7 @@ __global__ void image_mse_kernel(const uint32_t n_elements, const vec3* __restri vec3 pred = prediction[i]; if (quantize_to_byte) { - pred = vec3(clamp(ivec3(pred * 255.0f + vec3(0.5f)), ivec3(0), ivec3(255))) / 255.0f; + pred = vec3(clamp(ivec3(pred * 255.0f + 0.5f), 0, 255)) / 255.0f; } const vec3 diff = target[i] - pred; @@ -467,7 +457,7 @@ float Testbed::compute_image_mse(bool quantize_to_byte) { const uint32_t n_input_dims = 2; // Auxiliary matrices for training - const uint32_t n_elements = compMul(m_image.resolution); + const uint32_t n_elements = product(m_image.resolution); const uint32_t max_batch_size = 1u<<20; GPUMemory se(n_elements); @@ -526,4 +516,4 @@ float Testbed::compute_image_mse(bool quantize_to_byte) { return reduce_sum(se.data(), n_elements, nullptr) / n_elements; } -NGP_NAMESPACE_END +} diff --git a/src/testbed_nerf.cu b/src/testbed_nerf.cu index 2ef067c2a..404512d5e 100644 --- a/src/testbed_nerf.cu +++ b/src/testbed_nerf.cu @@ -41,26 +41,7 @@ #undef copysign #endif -using namespace tcnn; - -NGP_NAMESPACE_BEGIN - -inline constexpr __device__ float NERF_RENDERING_NEAR_DISTANCE() { return 0.05f; } -inline constexpr __device__ uint32_t NERF_STEPS() { return 1024; } // finest number of steps per unit length -inline constexpr __device__ uint32_t NERF_CASCADES() { return 8; } - -inline constexpr __device__ float SQRT3() { return 1.73205080757f; } -inline constexpr __device__ float STEPSIZE() { return (SQRT3() / NERF_STEPS()); } // for nerf raymarch -inline constexpr __device__ float MIN_CONE_STEPSIZE() { return STEPSIZE(); } -// Maximum step size is the width of the coarsest gridsize cell. -inline constexpr __device__ float MAX_CONE_STEPSIZE() { return STEPSIZE() * (1<<(NERF_CASCADES()-1)) * NERF_STEPS() / NERF_GRIDSIZE(); } - -// Used to index into the PRNG stream. Must be larger than the number of -// samples consumed by any given training ray. -inline constexpr __device__ uint32_t N_MAX_RANDOM_SAMPLES_PER_RAY() { return 16; } - -// Any alpha below this is considered "invisible" and is thus culled away. -inline constexpr __device__ float NERF_MIN_OPTICAL_THICKNESS() { return 0.01f; } +namespace ngp { static constexpr uint32_t MARCH_ITER = 10000; @@ -75,336 +56,6 @@ Testbed::NetworkDims Testbed::network_dims_nerf() const { return dims; } -inline __host__ __device__ uint32_t grid_mip_offset(uint32_t mip) { - return NERF_GRID_N_CELLS() * mip; -} - -inline __host__ __device__ float calc_cone_angle(float cosine, const vec2& focal_length, float cone_angle_constant) { - // Pixel size. 
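For intuition about the step-size constants removed in this hunk: with `NERF_STEPS() = 1024` steps per unit length, the finest march step is one 1024th of the unit cube's diagonal,

	constexpr float SQRT3    = 1.73205080757f;
	constexpr float STEPSIZE = SQRT3 / 1024.0f; // ~0.00169, the MIN_CONE_STEPSIZE()

and `MAX_CONE_STEPSIZE()` scales this up to the cell width of the coarsest occupancy-grid cascade, per the comment in the deleted block.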
Doesn't always yield a good performance vs. quality - // trade off. Especially if training pixels have a much different - // size than rendering pixels. - // return cosine*cosine / focal_length.mean(); - - return cone_angle_constant; -} - -inline __host__ __device__ float to_stepping_space(float t, float cone_angle) { - if (cone_angle <= 1e-5f) { - return t / MIN_CONE_STEPSIZE(); - } - - float log1p_c = logf(1.0f + cone_angle); - - float a = (logf(MIN_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c; - float b = (logf(MAX_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c; - - float at = expf(a * log1p_c); - float bt = expf(b * log1p_c); - - if (t <= at) { - return (t - at) / MIN_CONE_STEPSIZE() + a; - } else if (t <= bt) { - return logf(t) / log1p_c; - } else { - return (t - bt) / MAX_CONE_STEPSIZE() + b; - } -} - -inline __host__ __device__ float from_stepping_space(float n, float cone_angle) { - if (cone_angle <= 1e-5f) { - return n * MIN_CONE_STEPSIZE(); - } - - float log1p_c = logf(1.0f + cone_angle); - - float a = (logf(MIN_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c; - float b = (logf(MAX_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c; - - float at = expf(a * log1p_c); - float bt = expf(b * log1p_c); - - if (n <= a) { - return (n - a) * MIN_CONE_STEPSIZE() + at; - } else if (n <= b) { - return expf(n * log1p_c); - } else { - return (n - b) * MAX_CONE_STEPSIZE() + bt; - } -} - -inline __host__ __device__ float advance_n_steps(float t, float cone_angle, float n) { - return from_stepping_space(to_stepping_space(t, cone_angle) + n, cone_angle); -} - -inline __host__ __device__ float calc_dt(float t, float cone_angle) { - return advance_n_steps(t, cone_angle, 1.0f) - t; -} - -struct LossAndGradient { - vec3 loss; - vec3 gradient; - - __host__ __device__ LossAndGradient operator*(float scalar) { - return {loss * scalar, gradient * scalar}; - } - - __host__ __device__ LossAndGradient operator/(float scalar) { - return {loss / scalar, gradient / scalar}; - } -}; - -inline __device__ vec3 copysign(const vec3& a, const vec3& b) { - return { - copysignf(a.x, b.x), - copysignf(a.y, b.y), - copysignf(a.z, b.z), - }; -} - -inline __device__ LossAndGradient l2_loss(const vec3& target, const vec3& prediction) { - vec3 difference = prediction - target; - return { - difference * difference, - 2.0f * difference - }; -} - -inline __device__ LossAndGradient relative_l2_loss(const vec3& target, const vec3& prediction) { - vec3 difference = prediction - target; - vec3 denom = prediction * prediction + vec3(1e-2f); - return { - difference * difference / denom, - 2.0f * difference / denom - }; -} - -inline __device__ LossAndGradient l1_loss(const vec3& target, const vec3& prediction) { - vec3 difference = prediction - target; - return { - abs(difference), - copysign(vec3(1.0f), difference), - }; -} - -inline __device__ LossAndGradient huber_loss(const vec3& target, const vec3& prediction, float alpha = 1) { - vec3 difference = prediction - target; - vec3 abs_diff = abs(difference); - vec3 square = 0.5f/alpha * difference * difference; - return { - { - abs_diff.x > alpha ? (abs_diff.x - 0.5f * alpha) : square.x, - abs_diff.y > alpha ? (abs_diff.y - 0.5f * alpha) : square.y, - abs_diff.z > alpha ? (abs_diff.z - 0.5f * alpha) : square.z, - }, - { - abs_diff.x > alpha ? (difference.x > 0 ? 1.0f : -1.0f) : (difference.x / alpha), - abs_diff.y > alpha ? (difference.y > 0 ? 1.0f : -1.0f) : (difference.y / alpha), - abs_diff.z > alpha ? (difference.z > 0 ? 
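`to_stepping_space`/`from_stepping_space` above convert a ray distance t to a continuous step index n and back: stepping is linear at the minimum step size near the camera, exponential with ratio (1 + cone_angle) in the middle range, and linear again at the maximum step size far away; the breakpoints a and b are placed so the pieces join smoothly. The two functions are exact inverses of each other, which the marching code depends on:

	float n = to_stepping_space(t, cone_angle);
	float t2 = from_stepping_space(n, cone_angle); // t2 == t up to rounding
	// advance_n_steps(t, cone_angle, 1.0f) composes them to take one step.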
1.0f : -1.0f) : (difference.z / alpha), - }, - }; -} - -inline __device__ LossAndGradient log_l1_loss(const vec3& target, const vec3& prediction) { - vec3 difference = prediction - target; - vec3 divisor = abs(difference) + vec3(1.0f); - return { - log(divisor), - copysign(vec3(1.0f) / divisor, difference), - }; -} - -inline __device__ LossAndGradient smape_loss(const vec3& target, const vec3& prediction) { - vec3 difference = prediction - target; - vec3 denom = 0.5f * (abs(prediction) + abs(target)) + vec3(1e-2f); - return { - abs(difference) / denom, - copysign(vec3(1.0f) / denom, difference), - }; -} - -inline __device__ LossAndGradient mape_loss(const vec3& target, const vec3& prediction) { - vec3 difference = prediction - target; - vec3 denom = abs(prediction) + vec3(1e-2f); - return { - abs(difference) / denom, - copysign(vec3(1.0f) / denom, difference), - }; -} - -inline __device__ float distance_to_next_voxel(const vec3& pos, const vec3& dir, const vec3& idir, float res) { // dda like step - vec3 p = res * (pos - vec3(0.5f)); - float tx = (floorf(p.x + 0.5f + 0.5f * sign(dir.x)) - p.x) * idir.x; - float ty = (floorf(p.y + 0.5f + 0.5f * sign(dir.y)) - p.y) * idir.y; - float tz = (floorf(p.z + 0.5f + 0.5f * sign(dir.z)) - p.z) * idir.z; - float t = min(min(tx, ty), tz); - - return fmaxf(t / res, 0.0f); -} - -inline __device__ float advance_to_next_voxel(float t, float cone_angle, const vec3& pos, const vec3& dir, const vec3& idir, uint32_t mip) { - float res = scalbnf(NERF_GRIDSIZE(), -(int)mip); - - float t_target = t + distance_to_next_voxel(pos, dir, idir, res); - - // Analytic stepping in multiples of 1 in the "log-space" of our exponential stepping routine - t = to_stepping_space(t, cone_angle); - t_target = to_stepping_space(t_target, cone_angle); - - return from_stepping_space(t + ceilf(fmaxf(t_target - t, 0.5f)), cone_angle); -} - -__device__ float network_to_rgb(float val, ENerfActivation activation) { - switch (activation) { - case ENerfActivation::None: return val; - case ENerfActivation::ReLU: return val > 0.0f ? val : 0.0f; - case ENerfActivation::Logistic: return tcnn::logistic(val); - case ENerfActivation::Exponential: return __expf(tcnn::clamp(val, -10.0f, 10.0f)); - default: assert(false); - } - return 0.0f; -} - -__device__ float network_to_rgb_derivative(float val, ENerfActivation activation) { - switch (activation) { - case ENerfActivation::None: return 1.0f; - case ENerfActivation::ReLU: return val > 0.0f ? 1.0f : 0.0f; - case ENerfActivation::Logistic: { float density = tcnn::logistic(val); return density * (1 - density); }; - case ENerfActivation::Exponential: return __expf(tcnn::clamp(val, -10.0f, 10.0f)); - default: assert(false); - } - return 0.0f; -} - -template -__device__ vec3 network_to_rgb_derivative_vec(const T& val, ENerfActivation activation) { - return { - network_to_rgb_derivative(float(val[0]), activation), - network_to_rgb_derivative(float(val[1]), activation), - network_to_rgb_derivative(float(val[2]), activation), - }; -} - -__device__ float network_to_density(float val, ENerfActivation activation) { - switch (activation) { - case ENerfActivation::None: return val; - case ENerfActivation::ReLU: return val > 0.0f ? 
val : 0.0f; - case ENerfActivation::Logistic: return tcnn::logistic(val); - case ENerfActivation::Exponential: return __expf(val); - default: assert(false); - } - return 0.0f; -} - -__device__ float network_to_density_derivative(float val, ENerfActivation activation) { - switch (activation) { - case ENerfActivation::None: return 1.0f; - case ENerfActivation::ReLU: return val > 0.0f ? 1.0f : 0.0f; - case ENerfActivation::Logistic: { float density = tcnn::logistic(val); return density * (1 - density); }; - case ENerfActivation::Exponential: return __expf(tcnn::clamp(val, -15.0f, 15.0f)); - default: assert(false); - } - return 0.0f; -} - -template -__device__ vec3 network_to_rgb_vec(const T& val, ENerfActivation activation) { - return { - network_to_rgb(float(val[0]), activation), - network_to_rgb(float(val[1]), activation), - network_to_rgb(float(val[2]), activation), - }; -} - -__device__ vec3 warp_position(const vec3& pos, const BoundingBox& aabb) { - // return {tcnn::logistic(pos.x - 0.5f), tcnn::logistic(pos.y - 0.5f), tcnn::logistic(pos.z - 0.5f)}; - // return pos; - - return aabb.relative_pos(pos); -} - -__device__ vec3 unwarp_position(const vec3& pos, const BoundingBox& aabb) { - // return {logit(pos.x) + 0.5f, logit(pos.y) + 0.5f, logit(pos.z) + 0.5f}; - // return pos; - - return aabb.min + pos * aabb.diag(); -} - -__device__ vec3 unwarp_position_derivative(const vec3& pos, const BoundingBox& aabb) { - // return {logit(pos.x) + 0.5f, logit(pos.y) + 0.5f, logit(pos.z) + 0.5f}; - // return pos; - - return aabb.diag(); -} - -__device__ vec3 warp_position_derivative(const vec3& pos, const BoundingBox& aabb) { - return vec3(1.0f) / unwarp_position_derivative(pos, aabb); -} - -__host__ __device__ vec3 warp_direction(const vec3& dir) { - return (dir + vec3(1.0f)) * 0.5f; -} - -__device__ vec3 unwarp_direction(const vec3& dir) { - return dir * 2.0f - vec3(1.0f); -} - -__device__ vec3 warp_direction_derivative(const vec3& dir) { - return vec3(0.5f); -} - -__device__ vec3 unwarp_direction_derivative(const vec3& dir) { - return vec3(2.0f); -} - -__device__ float warp_dt(float dt) { - float max_stepsize = MIN_CONE_STEPSIZE() * (1<<(NERF_CASCADES()-1)); - return (dt - MIN_CONE_STEPSIZE()) / (max_stepsize - MIN_CONE_STEPSIZE()); -} - -__device__ float unwarp_dt(float dt) { - float max_stepsize = MIN_CONE_STEPSIZE() * (1<<(NERF_CASCADES()-1)); - return dt * (max_stepsize - MIN_CONE_STEPSIZE()) + MIN_CONE_STEPSIZE(); -} - -__device__ uint32_t cascaded_grid_idx_at(vec3 pos, uint32_t mip) { - float mip_scale = scalbnf(1.0f, -mip); - pos -= vec3(0.5f); - pos *= mip_scale; - pos += vec3(0.5f); - - ivec3 i = pos * (float)NERF_GRIDSIZE(); - if (i.x < 0 || i.x >= NERF_GRIDSIZE() || i.y < 0 || i.y >= NERF_GRIDSIZE() || i.z < 0 || i.z >= NERF_GRIDSIZE()) { - return 0xFFFFFFFF; - } - - return tcnn::morton3D(i.x, i.y, i.z); -} - -__device__ bool density_grid_occupied_at(const vec3& pos, const uint8_t* density_grid_bitfield, uint32_t mip) { - uint32_t idx = cascaded_grid_idx_at(pos, mip); - if (idx == 0xFFFFFFFF) { - return false; - } - return density_grid_bitfield[idx/8+grid_mip_offset(mip)/8] & (1<<(idx%8)); -} - -__device__ float cascaded_grid_at(vec3 pos, const float* cascaded_grid, uint32_t mip) { - uint32_t idx = cascaded_grid_idx_at(pos, mip); - if (idx == 0xFFFFFFFF) { - return 0.0f; - } - return cascaded_grid[idx+grid_mip_offset(mip)]; -} - -__device__ float& cascaded_grid_at(vec3 pos, float* cascaded_grid, uint32_t mip) { - uint32_t idx = cascaded_grid_idx_at(pos, mip); - if (idx == 0xFFFFFFFF) { - 
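// For intuition about the sentinel handled just below: cascaded_grid_idx_at()
// contracts the query position about the grid center by 2^-mip before quantizing,
// so each cascade covers twice the extent of the previous one at the same cell
// count. A rough sketch of that mapping, assuming the default NERF_GRIDSIZE() of 128:
//
//   vec3 p = (pos - vec3(0.5f)) * scalbnf(1.0f, -(int)mip) + vec3(0.5f); // contract toward 0.5
//   ivec3 cell = p * 128.0f;                          // quantize to the 128^3 cascade
//   uint32_t idx = morton3D(cell.x, cell.y, cell.z);  // Morton order keeps neighbors close in memory
//
// Positions that fall outside the cascade quantize out of [0, 128)^3 and produce
// the 0xFFFFFFFF sentinel, in which case this accessor falls back to cell 0 and
// prints a warning rather than reading out of bounds.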
idx = 0; - printf("WARNING: invalid cascaded grid access."); - } - return cascaded_grid[idx+grid_mip_offset(mip)]; -} - __global__ void extract_srgb_with_activation(const uint32_t n_elements, const uint32_t rgb_stride, const float* __restrict__ rgbd, float* __restrict__ rgb, ENerfActivation rgb_activation, bool from_linear) { const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n_elements) return; @@ -432,12 +83,12 @@ __global__ void mark_untrained_density_grid(const uint32_t n_elements, float* _ uint32_t level = i / NERF_GRID_N_CELLS(); uint32_t pos_idx = i % NERF_GRID_N_CELLS(); - uint32_t x = tcnn::morton3D_invert(pos_idx>>0); - uint32_t y = tcnn::morton3D_invert(pos_idx>>1); - uint32_t z = tcnn::morton3D_invert(pos_idx>>2); + uint32_t x = morton3D_invert(pos_idx>>0); + uint32_t y = morton3D_invert(pos_idx>>1); + uint32_t z = morton3D_invert(pos_idx>>2); float voxel_size = scalbnf(1.0f / NERF_GRIDSIZE(), level); - vec3 pos = (vec3{(float)x, (float)y, (float)z} / (float)NERF_GRIDSIZE() - vec3(0.5f)) * scalbnf(1.0f, level) + vec3(0.5f); + vec3 pos = (vec3{(float)x, (float)y, (float)z} / (float)NERF_GRIDSIZE() - 0.5f) * scalbnf(1.0f, level) + 0.5f; vec3 corners[8] = { pos + vec3{0.0f, 0.0f, 0.0f }, @@ -503,7 +154,7 @@ __global__ void generate_grid_samples_nerf_uniform(ivec3 res_3d, const uint32_t } uint32_t i = x + y * res_3d.x + z * res_3d.x * res_3d.y; - vec3 pos = vec3{(float)x, (float)y, (float)z} / vec3(res_3d - ivec3(1)); + vec3 pos = vec3{(float)x, (float)y, (float)z} / vec3(res_3d - 1); pos = transpose(render_aabb_to_local) * (pos * (render_aabb.max - render_aabb.min) + render_aabb.min); out[i] = { warp_position(pos, train_aabb), warp_dt(MIN_CONE_STEPSIZE()) }; } @@ -523,7 +174,7 @@ __global__ void generate_grid_samples_nerf_uniform_dir(ivec3 res_3d, const uint3 if (voxel_centers) { pos = vec3{(float)x + 0.5f, (float)y + 0.5f, (float)z + 0.5f} / vec3(res_3d); } else { - pos = vec3{(float)x, (float)y, (float)z} / vec3(res_3d - ivec3(1)); + pos = vec3{(float)x, (float)y, (float)z} / vec3(res_3d - 1); } pos = transpose(render_aabb_to_local) * (pos * (render_aabb.max - render_aabb.min) + render_aabb.min); @@ -531,25 +182,6 @@ __global__ void generate_grid_samples_nerf_uniform_dir(ivec3 res_3d, const uint3 network_input(i)->set_with_optional_extra_dims(warp_position(pos, train_aabb), warp_direction(ray_dir), warp_dt(MIN_CONE_STEPSIZE()), extra_dims, network_input.stride_in_bytes); } -inline __device__ uint32_t mip_from_pos(const vec3& pos, uint32_t max_cascade = NERF_CASCADES()-1) { - int exponent; - float maxval = compMax(abs(pos - vec3(0.5f))); - frexpf(maxval, &exponent); - return (uint32_t)tcnn::clamp(exponent+1, 0, (int)max_cascade); -} - -inline __device__ uint32_t mip_from_dt(float dt, const vec3& pos, uint32_t max_cascade = NERF_CASCADES()-1) { - uint32_t mip = mip_from_pos(pos, max_cascade); - dt *= 2 * NERF_GRIDSIZE(); - if (dt < 1.0f) { - return mip; - } - - int exponent; - frexpf(dt, &exponent); - return (uint32_t)tcnn::clamp((int)mip, exponent, (int)max_cascade); -} - __global__ void generate_grid_samples_nerf_nonuniform(const uint32_t n_elements, default_rng_t rng, const uint32_t step, BoundingBox aabb, const float* __restrict__ grid_in, NerfPosition* __restrict__ out, uint32_t* __restrict__ indices, uint32_t n_cascades, float thresh) { const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n_elements) return; @@ -571,17 +203,17 @@ __global__ void generate_grid_samples_nerf_nonuniform(const uint32_t n_elements, // Random position within that cellq 
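// As a worked example of the decomposition below (assuming the usual
// NERF_GRID_N_CELLS() == NERF_GRIDSIZE()^3 == 128^3 cells per cascade): an index
// of 128^3 + 5 selects Morton cell 5 of cascade level 1. The jittered point inside
// that cell is first expressed in the cascade's [0,1]^3 and then expanded by
// scalbnf(1.0f, level) about the grid center -- the exact inverse of the
// contraction used when the grid is read back.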
uint32_t pos_idx = idx % NERF_GRID_N_CELLS(); - uint32_t x = tcnn::morton3D_invert(pos_idx>>0); - uint32_t y = tcnn::morton3D_invert(pos_idx>>1); - uint32_t z = tcnn::morton3D_invert(pos_idx>>2); + uint32_t x = morton3D_invert(pos_idx>>0); + uint32_t y = morton3D_invert(pos_idx>>1); + uint32_t z = morton3D_invert(pos_idx>>2); - vec3 pos = ((vec3{(float)x, (float)y, (float)z} + random_val_3d(rng)) / (float)NERF_GRIDSIZE() - vec3(0.5f)) * scalbnf(1.0f, level) + vec3(0.5f); + vec3 pos = ((vec3{(float)x, (float)y, (float)z} + random_val_3d(rng)) / (float)NERF_GRIDSIZE() - 0.5f) * scalbnf(1.0f, level) + 0.5f; out[i] = { warp_position(pos, aabb), warp_dt(MIN_CONE_STEPSIZE()) }; indices[i] = idx; } -__global__ void splat_grid_samples_nerf_max_nearest_neighbor(const uint32_t n_elements, const uint32_t* __restrict__ indices, const tcnn::network_precision_t* network_output, float* __restrict__ grid_out, ENerfActivation rgb_activation, ENerfActivation density_activation) { +__global__ void splat_grid_samples_nerf_max_nearest_neighbor(const uint32_t n_elements, const uint32_t* __restrict__ indices, const network_precision_t* network_output, float* __restrict__ grid_out, ENerfActivation rgb_activation, ENerfActivation density_activation) { const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n_elements) return; @@ -599,7 +231,7 @@ __global__ void splat_grid_samples_nerf_max_nearest_neighbor(const uint32_t n_el atomicMax((uint32_t*)&grid_out[local_idx], __float_as_uint(optical_thickness)); } -__global__ void grid_samples_half_to_float(const uint32_t n_elements, BoundingBox aabb, float* dst, const tcnn::network_precision_t* network_output, ENerfActivation density_activation, const NerfPosition* __restrict__ coords_in, const float* __restrict__ grid_in, uint32_t max_cascade) { +__global__ void grid_samples_half_to_float(const uint32_t n_elements, BoundingBox aabb, float* dst, const network_precision_t* network_output, ENerfActivation density_activation, const NerfPosition* __restrict__ coords_in, const float* __restrict__ grid_in, uint32_t max_cascade) { const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n_elements) return; @@ -691,46 +323,11 @@ __global__ void bitfield_max_pool(const uint32_t n_elements, bits |= prev_level[i*8+j] > 0 ? ((uint8_t)1 << j) : 0; } - uint32_t x = tcnn::morton3D_invert(i>>0) + NERF_GRIDSIZE()/8; - uint32_t y = tcnn::morton3D_invert(i>>1) + NERF_GRIDSIZE()/8; - uint32_t z = tcnn::morton3D_invert(i>>2) + NERF_GRIDSIZE()/8; - - next_level[tcnn::morton3D(x, y, z)] |= bits; -} - -template -__device__ float if_unoccupied_advance_to_next_occupied_voxel( - float t, - float cone_angle, - const Ray& ray, - const vec3& idir, - const uint8_t* __restrict__ density_grid, - uint32_t min_mip, - uint32_t max_mip, - BoundingBox aabb, - mat3 aabb_to_local = mat3(1.0f) -) { - while (true) { - vec3 pos = ray(t); - if (t >= MAX_DEPTH() || !aabb.contains(aabb_to_local * pos)) { - return MAX_DEPTH(); - } - - uint32_t mip = tcnn::clamp(MIP_FROM_DT ? mip_from_dt(calc_dt(t, cone_angle), pos) : mip_from_pos(pos), min_mip, max_mip); - - if (!density_grid || density_grid_occupied_at(pos, density_grid, mip)) { - return t; - } + uint32_t x = morton3D_invert(i>>0) + NERF_GRIDSIZE()/8; + uint32_t y = morton3D_invert(i>>1) + NERF_GRIDSIZE()/8; + uint32_t z = morton3D_invert(i>>2) + NERF_GRIDSIZE()/8; - // Find largest empty voxel surrounding us, such that we can advance as far as possible in the next step. 
- // Other places that do voxel stepping don't need this, because they don't rely on thread coherence as - // much as this one here. - while (mip < max_mip && !density_grid_occupied_at(pos, density_grid, mip+1)) { - ++mip; - } - - t = advance_to_next_voxel(t, cone_angle, pos, ray.d, idir, mip); - } + next_level[morton3D(x, y, z)] |= bits; } __device__ void advance_pos_nerf( @@ -787,7 +384,7 @@ __global__ void generate_nerf_network_inputs_from_positions(const uint32_t n_ele const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n_elements) return; - vec3 dir = normalize(pos[i] - vec3(0.5f)); // choose outward pointing directions, for want of a better choice + vec3 dir = normalize(pos[i] - 0.5f); // choose outward pointing directions, for want of a better choice network_input(i)->set_with_optional_extra_dims(warp_position(pos[i], aabb), warp_direction(dir), warp_dt(MIN_CONE_STEPSIZE()), extra_dims, network_input.stride_in_bytes); } @@ -807,10 +404,10 @@ __device__ vec4 compute_nerf_rgba(const vec4& network_output, ENerfActivation rg if (density_as_alpha) { rgba.a = density; } else { - rgba.a = alpha = tcnn::clamp(1.f - __expf(-density * depth), 0.0f, 1.0f); + rgba.a = alpha = clamp(1.f - __expf(-density * depth), 0.0f, 1.0f); } - rgba.rgb = network_to_rgb_vec(rgba.rgb, rgb_activation) * alpha; + rgba.rgb() = network_to_rgb_vec(rgba.rgb(), rgb_activation) * alpha; return rgba; } @@ -884,7 +481,7 @@ __global__ void composite_kernel_nerf( float* __restrict__ depth, NerfPayload* payloads, PitchedPtr network_input, - const tcnn::network_precision_t* __restrict__ network_output, + const network_precision_t* __restrict__ network_output, uint32_t padded_output_width, uint32_t n_steps, ERenderMode render_mode, @@ -912,7 +509,7 @@ __global__ void composite_kernel_nerf( uint32_t j = 0; for (; j < actual_n_steps; ++j) { - tcnn::vector_t local_network_output; + tvec local_network_output; local_network_output[0] = network_output[i + j * n_elements + 0 * stride]; local_network_output[1] = network_output[i + j * n_elements + 1 * stride]; local_network_output[2] = network_output[i + j * n_elements + 2 * stride]; @@ -1036,19 +633,7 @@ __global__ void composite_kernel_nerf( vec3 normal = -network_to_density_derivative(float(local_network_output[3]), density_activation) * warped_pos; rgb = normalize(normal); } else if (render_mode == ERenderMode::Positions) { - if (show_accel >= 0) { - uint32_t mip = max(show_accel, mip_from_pos(pos)); - uint32_t res = NERF_GRIDSIZE() >> mip; - int ix = pos.x * res; - int iy = pos.y * res; - int iz = pos.z * res; - default_rng_t rng(ix + iy * 232323 + iz * 727272); - rgb.x = 1.f - mip * (1.f / (NERF_CASCADES() - 1)); - rgb.y = rng.next_float(); - rgb.z = rng.next_float(); - } else { - rgb = (pos - vec3(0.5f)) / 2.0f + vec3(0.5f); - } + rgb = (pos - 0.5f) / 2.0f + 0.5f; } else if (render_mode == ERenderMode::EncodingVis) { rgb = warped_pos; } else if (render_mode == ERenderMode::Depth) { @@ -1057,6 +642,18 @@ __global__ void composite_kernel_nerf( rgb = vec3(alpha); } + if (show_accel >= 0) { + uint32_t mip = max((uint32_t)show_accel, mip_from_pos(pos)); + uint32_t res = NERF_GRIDSIZE() >> mip; + int ix = pos.x * res; + int iy = pos.y * res; + int iz = pos.z * res; + default_rng_t rng(ix + iy * 232323 + iz * 727272); + rgb.x = 1.f - mip * (1.f / (NERF_CASCADES() - 1)); + rgb.y = rng.next_float(); + rgb.z = rng.next_float(); + } + local_rgba += vec4(rgb * weight, weight); if (weight > payload.max_weight) { payload.max_weight = weight; @@ -1078,119 +675,6 @@ __global__ 
void composite_kernel_nerf( depth[i] = local_depth; } -static constexpr float UNIFORM_SAMPLING_FRACTION = 0.5f; - -inline __device__ vec2 sample_cdf_2d(vec2 sample, uint32_t img, const ivec2& res, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y, float* __restrict__ pdf) { - if (sample.x < UNIFORM_SAMPLING_FRACTION) { - sample.x /= UNIFORM_SAMPLING_FRACTION; - return sample; - } - - sample.x = (sample.x - UNIFORM_SAMPLING_FRACTION) / (1.0f - UNIFORM_SAMPLING_FRACTION); - - cdf_y += img * res.y; - - // First select row according to cdf_y - uint32_t y = binary_search(sample.y, cdf_y, res.y); - float prev = y > 0 ? cdf_y[y-1] : 0.0f; - float pmf_y = cdf_y[y] - prev; - sample.y = (sample.y - prev) / pmf_y; - - cdf_x_cond_y += img * res.y * res.x + y * res.x; - - // Then, select col according to x - uint32_t x = binary_search(sample.x, cdf_x_cond_y, res.x); - prev = x > 0 ? cdf_x_cond_y[x-1] : 0.0f; - float pmf_x = cdf_x_cond_y[x] - prev; - sample.x = (sample.x - prev) / pmf_x; - - if (pdf) { - *pdf = pmf_x * pmf_y * compMul(res); - } - - return {((float)x + sample.x) / (float)res.x, ((float)y + sample.y) / (float)res.y}; -} - -inline __device__ float pdf_2d(vec2 sample, uint32_t img, const ivec2& res, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y) { - ivec2 p = clamp(ivec2(sample * vec2(res)), ivec2(0), res - ivec2(1)); - - cdf_y += img * res.y; - cdf_x_cond_y += img * res.y * res.x + p.y * res.x; - - float pmf_y = cdf_y[p.y]; - if (p.y > 0) { - pmf_y -= cdf_y[p.y-1]; - } - - float pmf_x = cdf_x_cond_y[p.x]; - if (p.x > 0) { - pmf_x -= cdf_x_cond_y[p.x-1]; - } - - // Probability mass of picking the pixel - float pmf = pmf_x * pmf_y; - - // To convert to probability density, divide by area of pixel - return UNIFORM_SAMPLING_FRACTION + pmf * compMul(res) * (1.0f - UNIFORM_SAMPLING_FRACTION); -} - -inline __device__ vec2 nerf_random_image_pos_training(default_rng_t& rng, const ivec2& resolution, bool snap_to_pixel_centers, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y, const ivec2& cdf_res, uint32_t img, float* __restrict__ pdf = nullptr) { - vec2 uv = random_val_2d(rng); - - if (cdf_x_cond_y) { - uv = sample_cdf_2d(uv, img, cdf_res, cdf_x_cond_y, cdf_y, pdf); - } else if (pdf) { - *pdf = 1.0f; - } - - if (snap_to_pixel_centers) { - uv = (vec2(clamp(ivec2(uv * vec2(resolution)), ivec2(0), resolution - ivec2(1))) + vec2(0.5f)) / vec2(resolution); - } - - return uv; -} - -inline __device__ uint32_t image_idx(uint32_t base_idx, uint32_t n_rays, uint32_t n_rays_total, uint32_t n_training_images, const float* __restrict__ cdf = nullptr, float* __restrict__ pdf = nullptr) { - if (cdf) { - float sample = ld_random_val(base_idx/* + n_rays_total*/, 0xdeadbeef); - // float sample = random_val(base_idx/* + n_rays_total*/); - uint32_t img = binary_search(sample, cdf, n_training_images); - - if (pdf) { - float prev = img > 0 ? cdf[img-1] : 0.0f; - *pdf = (cdf[img] - prev) * n_training_images; - } - - return img; - } - - // return ((base_idx/* + n_rays_total*/) * 56924617 + 96925573) % n_training_images; - - // Neighboring threads in the warp process the same image. Increases locality. 
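// A quick worked example of the fallback mapping below: with n_rays == 4096 and
// n_training_images == 64, (base_idx * 64) / 4096 increments once every 64
// consecutive ray indices, so rays 0..63 sample image 0, rays 64..127 sample
// image 1, and so on. Adjacent threads in a warp therefore touch the same image's
// pixels, which is the locality noted above. (The numbers are illustrative; only
// the ratio n_rays / n_training_images matters.)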
- if (pdf) { - *pdf = 1.0f; - } - return (((base_idx/* + n_rays_total*/) * n_training_images) / n_rays) % n_training_images; -} - -__device__ LossAndGradient loss_and_gradient(const vec3& target, const vec3& prediction, ELossType loss_type) { - switch (loss_type) { - case ELossType::RelativeL2: return relative_l2_loss(target, prediction); break; - case ELossType::L1: return l1_loss(target, prediction); break; - case ELossType::Mape: return mape_loss(target, prediction); break; - case ELossType::Smape: return smape_loss(target, prediction); break; - // Note: we divide the huber loss by a factor of 5 such that its L2 region near zero - // matches with the L2 loss and error numbers become more comparable. This allows reading - // off dB numbers of ~converged models and treating them as approximate PSNR to compare - // with other NeRF methods. Self-normalizing optimizers such as Adam are agnostic to such - // constant factors; optimization is therefore unaffected. - case ELossType::Huber: return huber_loss(target, prediction, 0.1f) / 5.0f; break; - case ELossType::LogL1: return log_l1_loss(target, prediction); break; - default: case ELossType::L2: return l2_loss(target, prediction); break; - } -} - - __global__ void generate_training_samples_nerf( const uint32_t n_rays, BoundingBox aabb, @@ -1366,14 +850,14 @@ __global__ void compute_loss_kernel_train_nerf( bool train_in_linear_colors, const uint32_t n_training_images, const TrainingImageMetadata* __restrict__ metadata, - const tcnn::network_precision_t* network_output, + const network_precision_t* network_output, uint32_t* __restrict__ numsteps_counter, const uint32_t* __restrict__ ray_indices_in, const Ray* __restrict__ rays_in_unnormalized, uint32_t* __restrict__ numsteps_in, PitchedPtr coords_in, PitchedPtr coords_out, - tcnn::network_precision_t* dloss_doutput, + network_precision_t* dloss_doutput, ELossType loss_type, ELossType depth_loss_type, float* __restrict__ loss_output, @@ -1424,7 +908,7 @@ __global__ void compute_loss_kernel_train_nerf( break; } - const tcnn::vector_t local_network_output = *(tcnn::vector_t*)network_output; + const tvec local_network_output = *(tvec*)network_output; const vec3 rgb = network_to_rgb_vec(local_network_output, rgb_activation); const vec3 pos = unwarp_position(coords_in.ptr->pos.p, aabb); const float dt = unwarp_dt(coords_in.ptr->dt); @@ -1469,7 +953,7 @@ __global__ void compute_loss_kernel_train_nerf( if (envmap) { dir = normalize(rays_in_unnormalized[i].d); envmap_value = read_envmap(envmap, dir); - background_color = envmap_value.rgb + background_color * (1.0f - envmap_value.a); + background_color = envmap_value.rgb() + background_color * (1.0f - envmap_value.a); } vec3 exposure_scale = exp(0.6931471805599453f * exposure[img]); @@ -1479,7 +963,7 @@ __global__ void compute_loss_kernel_train_nerf( vec3 rgbtarget; if (train_in_linear_colors || color_space == EColorSpace::Linear) { - rgbtarget = exposure_scale * texsamp.rgb + (1.0f - texsamp.a) * background_color; + rgbtarget = exposure_scale * texsamp.rgb() + (1.0f - texsamp.a) * background_color; if (!train_in_linear_colors) { rgbtarget = linear_to_srgb(rgbtarget); @@ -1488,7 +972,7 @@ __global__ void compute_loss_kernel_train_nerf( } else if (color_space == EColorSpace::SRGB) { background_color = linear_to_srgb(background_color); if (texsamp.a > 0) { - rgbtarget = linear_to_srgb(exposure_scale * texsamp.rgb / texsamp.a) * texsamp.a + (1.0f - texsamp.a) * background_color; + rgbtarget = linear_to_srgb(exposure_scale * texsamp.rgb() / texsamp.a) * 
texsamp.a + (1.0f - texsamp.a) * background_color; } else { rgbtarget = background_color; } @@ -1529,25 +1013,25 @@ __global__ void compute_loss_kernel_train_nerf( // to change the weighting of the loss function. So don't divide. // lg.gradient /= img_pdf * uv_pdf; - float mean_loss = compAdd(lg.loss) / 3.0f; + float mean_loss = mean(lg.loss); if (loss_output) { loss_output[i] = mean_loss / (float)n_rays; } if (error_map) { - const vec2 pos = clamp(uv * vec2(error_map_res) - vec2(0.5f), vec2(0.0f), vec2(error_map_res) - vec2(1.0f + 1e-4f)); + const vec2 pos = clamp(uv * vec2(error_map_res) - 0.5f, 0.0f, vec2(error_map_res) - (1.0f + 1e-4f)); const ivec2 pos_int = pos; const vec2 weight = pos - vec2(pos_int); - ivec2 idx = clamp(pos_int, ivec2(0), resolution - ivec2(2)); + ivec2 idx = clamp(pos_int, 0, resolution - 2); auto deposit_val = [&](int x, int y, float val) { - atomicAdd(&error_map[img * compMul(error_map_res) + y * error_map_res.x + x], val); + atomicAdd(&error_map[img * product(error_map_res) + y * error_map_res.x + x], val); }; if (sharpness_data && aabb.contains(hitpoint)) { - ivec2 sharpness_pos = clamp(ivec2(uv * vec2(sharpness_resolution)), ivec2(0), sharpness_resolution - ivec2(1)); - float sharp = sharpness_data[img * compMul(sharpness_resolution) + sharpness_pos.y * sharpness_resolution.x + sharpness_pos.x] + 1e-6f; + ivec2 sharpness_pos = clamp(ivec2(uv * vec2(sharpness_resolution)), 0, sharpness_resolution - 1); + float sharp = sharpness_data[img * product(sharpness_resolution) + sharpness_pos.y * sharpness_resolution.x + sharpness_pos.x] + 1e-6f; // The maximum value of positive floats interpreted in uint format is the same as the maximum value of the floats. float grid_sharp = __uint_as_float(atomicMax((uint32_t*)&cascaded_grid_at(hitpoint, sharpness_grid, mip_from_pos(hitpoint, max_mip)), __float_as_uint(sharp))); @@ -1584,7 +1068,7 @@ __global__ void compute_loss_kernel_train_nerf( float depth = distance(pos, ray_o); float dt = unwarp_dt(coord_in->dt); - const tcnn::vector_t local_network_output = *(tcnn::vector_t*)network_output; + const tvec local_network_output = *(tvec*)network_output; const vec3 rgb = network_to_rgb_vec(local_network_output, rgb_activation); const float density = network_to_density(float(local_network_output[3]), density_activation); const float alpha = 1.f - __expf(-density * dt); @@ -1597,7 +1081,7 @@ __global__ void compute_loss_kernel_train_nerf( const vec3 suffix = rgb_ray - rgb_ray2; const vec3 dloss_by_drgb = weight * lg.gradient; - tcnn::vector_t local_dL_doutput; + tvec local_dL_doutput; // chain rule to go from dloss/drgb to dloss/dmlp_output local_dL_doutput[0] = loss_scale * (dloss_by_drgb.x * network_to_rgb_derivative(local_network_output[0], rgb_activation) + fmaxf(0.0f, output_l2_reg * (float)local_network_output[0])); // Penalize way too large color values @@ -1621,7 +1105,7 @@ __global__ void compute_loss_kernel_train_nerf( (float(local_network_output[3]) > -10.0f && depth < near_distance ? 
1e-4f : 0.0f); ; - *(tcnn::vector_t*)dloss_doutput = local_dL_doutput; + *(tvec*)dloss_doutput = local_dL_doutput; dloss_doutput += padded_output_width; network_output += padded_output_width; @@ -1653,7 +1137,7 @@ __global__ void compute_loss_kernel_train_nerf( dloss_by_dbackground /= srgb_to_linear_derivative(background_color); } - tcnn::vector_t dL_denvmap; + tvec dL_denvmap; dL_denvmap[0] = loss_scale * dloss_by_dbackground.x; dL_denvmap[1] = loss_scale * dloss_by_dbackground.y; dL_denvmap[2] = loss_scale * dloss_by_dbackground.z; @@ -1662,7 +1146,7 @@ __global__ void compute_loss_kernel_train_nerf( float dloss_by_denvmap_alpha = -dot(dloss_by_dbackground, pre_envmap_background_color); // dL_denvmap[3] = loss_scale * dloss_by_denvmap_alpha; - dL_denvmap[3] = (tcnn::network_precision_t)0; + dL_denvmap[3] = (network_precision_t)0; deposit_envmap_gradient(dL_denvmap, envmap_gradient, envmap_resolution, dir); } @@ -1817,6 +1301,9 @@ __global__ void compute_extra_dims_gradient_train_nerf( __global__ void shade_kernel_nerf( const uint32_t n_elements, + bool gbuffer_hard_edges, + mat4x3 camera_matrix, + float depth_scale, vec4* __restrict__ rgba, float* __restrict__ depth, NerfPayload* __restrict__ payloads, @@ -1826,21 +1313,26 @@ __global__ void shade_kernel_nerf( float* __restrict__ depth_buffer ) { const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; - if (i >= n_elements) return; + if (i >= n_elements || render_mode == ERenderMode::Distortion) return; NerfPayload& payload = payloads[i]; vec4 tmp = rgba[i]; if (render_mode == ERenderMode::Normals) { vec3 n = normalize(tmp.xyz()); - tmp.rgb = (0.5f * n + vec3(0.5f)) * tmp.a; + tmp.rgb() = (0.5f * n + 0.5f) * tmp.a; } else if (render_mode == ERenderMode::Cost) { float col = (float)payload.n_steps / 128; tmp = {col, col, col, 1.0f}; + } else if (gbuffer_hard_edges && render_mode == ERenderMode::Depth) { + tmp.rgb() = vec3(depth[i] * depth_scale); + } else if (gbuffer_hard_edges && render_mode == ERenderMode::Positions) { + vec3 pos = camera_matrix[3] + payload.dir / dot(payload.dir, camera_matrix[2]) * depth[i]; + tmp.rgb() = (pos - 0.5f) / 2.0f + 0.5f; } if (!train_in_linear_colors && (render_mode == ERenderMode::Shade || render_mode == ERenderMode::Slice)) { // Accumulate in linear colors - tmp.rgb = srgb_to_linear(tmp.rgb); + tmp.rgb() = srgb_to_linear(tmp.rgb()); } frame_buffer[payload.idx] = tmp + frame_buffer[payload.idx] * (1.0f - tmp.a); @@ -1914,13 +1406,14 @@ __global__ void init_rays_with_payload_kernel_nerf( vec2 pixel_offset = ld_random_pixel_offset(snap_to_pixel_centers ? 
0 : sample_index); vec2 uv = vec2{(float)x + pixel_offset.x, (float)y + pixel_offset.y} / vec2(resolution); - float ray_time = rolling_shutter.x + rolling_shutter.y * uv.x + rolling_shutter.z * uv.y + rolling_shutter.w * ld_random_val(sample_index, idx * 72239731); + mat4x3 camera = get_xform_given_rolling_shutter({camera_matrix0, camera_matrix1}, rolling_shutter, uv, ld_random_val(sample_index, idx * 72239731)); + Ray ray = uv_to_ray( sample_index, uv, resolution, focal_length, - camera_matrix0 * ray_time + camera_matrix1 * (1.f - ray_time), + camera, screen_center, parallax_shift, near_distance, @@ -1955,6 +1448,17 @@ __global__ void init_rays_with_payload_kernel_nerf( return; } + if (render_mode == ERenderMode::Distortion) { + vec2 uv_after_distortion = pos_to_uv(ray(1.0f), resolution, focal_length, camera, screen_center, parallax_shift, foveation); + + frame_buffer[idx].rgb() = to_rgb((uv_after_distortion - uv) * 64.0f); + frame_buffer[idx].a = 1.0f; + depth_buffer[idx] = 1.0f; + payload.origin = ray(MAX_DEPTH()); + payload.alive = false; + return; + } + ray.d = normalize(ray.d); if (envmap) { @@ -1969,20 +1473,6 @@ __global__ void init_rays_with_payload_kernel_nerf( return; } - if (render_mode == ERenderMode::Distortion) { - vec2 offset = vec2(0.0f); - if (distortion) { - offset += distortion.at_lerp(vec2{(float)x + 0.5f, (float)y + 0.5f} / vec2(resolution)); - } - - frame_buffer[idx].rgb() = to_rgb(offset * 50.0f); - frame_buffer[idx].a = 1.0f; - depth_buffer[idx] = 1.0f; - payload.origin = ray(MAX_DEPTH()); - payload.alive = false; - return; - } - payload.origin = ray.o; payload.dir = ray.d; payload.t = t; @@ -2140,7 +1630,7 @@ void Testbed::NerfTracer::init_rays_from_camera( } uint32_t Testbed::NerfTracer::trace( - NerfNetwork& network, + const std::shared_ptr>& network, const BoundingBox& render_aabb, const mat3& render_aabb_to_local, const BoundingBox& train_aabb, @@ -2198,9 +1688,9 @@ uint32_t Testbed::NerfTracer::trace( // Want a large number of queries to saturate the GPU and to ensure compaction doesn't happen toooo frequently. 
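// Rough numbers for the clamp below: with the 2M-query target and 500k rays still
// alive, each ray marches about 4 steps per inference batch; as rays terminate,
// the quotient grows until it saturates at MAX_STEPS_INBETWEEN_COMPACTION, so
// neither compaction nor kernel-relaunch overhead dominates at either extreme.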
uint32_t target_n_queries = 2 * 1024 * 1024; - uint32_t n_steps_between_compaction = tcnn::clamp(target_n_queries / n_alive, (uint32_t)MIN_STEPS_INBETWEEN_COMPACTION, (uint32_t)MAX_STEPS_INBETWEEN_COMPACTION); + uint32_t n_steps_between_compaction = clamp(target_n_queries / n_alive, (uint32_t)MIN_STEPS_INBETWEEN_COMPACTION, (uint32_t)MAX_STEPS_INBETWEEN_COMPACTION); - uint32_t extra_stride = network.n_extra_dims() * sizeof(float); + uint32_t extra_stride = network->n_extra_dims() * sizeof(float); PitchedPtr input_data((NerfCoordinate*)m_network_input, 1, 0, extra_stride); linear_kernel(generate_next_nerf_network_inputs, 0, stream, n_alive, @@ -2218,15 +1708,15 @@ uint32_t Testbed::NerfTracer::trace( cone_angle_constant, extra_dims_gpu ); - uint32_t n_elements = next_multiple(n_alive * n_steps_between_compaction, tcnn::batch_size_granularity); + uint32_t n_elements = next_multiple(n_alive * n_steps_between_compaction, BATCH_SIZE_GRANULARITY); GPUMatrix positions_matrix((float*)m_network_input, (sizeof(NerfCoordinate) + extra_stride) / sizeof(float), n_elements); - GPUMatrix rgbsigma_matrix((network_precision_t*)m_network_output, network.padded_output_width(), n_elements); - network.inference_mixed_precision(stream, positions_matrix, rgbsigma_matrix); + GPUMatrix rgbsigma_matrix((network_precision_t*)m_network_output, network->padded_output_width(), n_elements); + network->inference_mixed_precision(stream, positions_matrix, rgbsigma_matrix); if (render_mode == ERenderMode::Normals) { - network.input_gradient(stream, 3, positions_matrix, positions_matrix); + network->input_gradient(stream, 3, positions_matrix, positions_matrix); } else if (render_mode == ERenderMode::EncodingVis) { - network.visualize_activation(stream, visualized_layer, visualized_dim, positions_matrix, positions_matrix); + network->visualize_activation(stream, visualized_layer, visualized_dim, positions_matrix, positions_matrix); } linear_kernel(composite_kernel_nerf, 0, stream, @@ -2244,7 +1734,7 @@ uint32_t Testbed::NerfTracer::trace( rays_current.payload, input_data, m_network_output, - network.padded_output_width(), + network->padded_output_width(), n_steps_between_compaction, render_mode, grid, @@ -2264,7 +1754,7 @@ uint32_t Testbed::NerfTracer::trace( } void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_width, uint32_t n_extra_dims, cudaStream_t stream) { - n_elements = next_multiple(n_elements, size_t(tcnn::batch_size_granularity)); + n_elements = next_multiple(n_elements, size_t(BATCH_SIZE_GRANULARITY)); size_t num_floats = sizeof(NerfCoordinate) / sizeof(float) + n_extra_dims; auto scratch = allocate_workspace_and_distribute< vec4, float, NerfPayload, // m_rays[0] @@ -2297,26 +1787,21 @@ void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_widt m_alive_counter = std::get<12>(scratch); } -void Testbed::Nerf::Training::reset_extra_dims(default_rng_t& rng) { - uint32_t n_extra_dims = dataset.n_extra_dims(); - std::vector extra_dims_cpu(n_extra_dims * (dataset.n_images + 1)); // n_images + 1 since we use an extra 'slot' for the inference latent code - float* dst = extra_dims_cpu.data(); - extra_dims_opt = std::vector(dataset.n_images, VarAdamOptimizer(n_extra_dims, 1e-4f)); - for (uint32_t i = 0; i < dataset.n_images; ++i) { - vec3 light_dir = warp_direction(normalize(dataset.metadata[i].light_dir)); - extra_dims_opt[i].reset_state(); - std::vector& optimzer_value = extra_dims_opt[i].variable(); - for (uint32_t j = 0; j < n_extra_dims; ++j) { - if (dataset.has_light_dirs 
&& j < 3) { - dst[j] = light_dir[j]; - } else { - dst[j] = random_val(rng) * 2.0f - 1.0f; - } - optimzer_value[j] = dst[j]; - } - dst += n_extra_dims; +std::vector Testbed::Nerf::Training::get_extra_dims_cpu(int trainview) const { + if (dataset.n_extra_dims() == 0) { + return {}; + } + + if (trainview < 0 || trainview >= dataset.n_images) { + throw std::runtime_error{"Invalid training view."}; } - extra_dims_gpu.resize_and_copy_from_host(extra_dims_cpu); + + const float* extra_dims_src = extra_dims_gpu.data() + trainview * dataset.n_extra_dims(); + + std::vector extra_dims_cpu(dataset.n_extra_dims()); + CUDA_CHECK_THROW(cudaMemcpy(extra_dims_cpu.data(), extra_dims_src, dataset.n_extra_dims() * sizeof(float), cudaMemcpyDeviceToHost)); + + return extra_dims_cpu; } void Testbed::Nerf::Training::update_extra_dims() { @@ -2332,29 +1817,11 @@ void Testbed::Nerf::Training::update_extra_dims() { CUDA_CHECK_THROW(cudaMemcpyAsync(extra_dims_gpu.data(), extra_dims_cpu.data(), extra_dims_opt.size() * n_extra_dims * sizeof(float), cudaMemcpyHostToDevice)); } -const float* Testbed::get_inference_extra_dims(cudaStream_t stream) const { - if (m_nerf_network->n_extra_dims() == 0) { - return nullptr; - } - const float* extra_dims_src = m_nerf.training.extra_dims_gpu.data() + m_nerf.extra_dim_idx_for_inference * m_nerf.training.dataset.n_extra_dims(); - if (!m_nerf.training.dataset.has_light_dirs) { - return extra_dims_src; - } - - // the dataset has light directions, so we must construct a temporary buffer and fill it as requested. - // we use an extra 'slot' that was pre-allocated for us at the end of the extra_dims array. - size_t size = m_nerf_network->n_extra_dims() * sizeof(float); - float* dims_gpu = m_nerf.training.extra_dims_gpu.data() + m_nerf.training.dataset.n_images * m_nerf.training.dataset.n_extra_dims(); - CUDA_CHECK_THROW(cudaMemcpyAsync(dims_gpu, extra_dims_src, size, cudaMemcpyDeviceToDevice, stream)); - vec3 light_dir = warp_direction(normalize(m_nerf.light_dir)); - CUDA_CHECK_THROW(cudaMemcpyAsync(dims_gpu, &light_dir, min(size, sizeof(vec3)), cudaMemcpyHostToDevice, stream)); - return dims_gpu; -} - void Testbed::render_nerf( cudaStream_t stream, + CudaDevice& device, const CudaRenderBufferView& render_buffer, - NerfNetwork& nerf_network, + const std::shared_ptr>& nerf_network, const uint8_t* density_grid_bitfield, const vec2& focal_length, const mat4x3& camera_matrix0, @@ -2371,20 +1838,25 @@ void Testbed::render_nerf( ERenderMode render_mode = visualized_dimension > -1 ? ERenderMode::EncodingVis : m_render_mode; - const float* extra_dims_gpu = get_inference_extra_dims(stream); + const float* extra_dims_gpu = m_nerf.get_rendering_extra_dims(stream); NerfTracer tracer; - // Our motion vector code can't undo grid distortions -- so don't render grid distortion if DLSS is enabled - auto grid_distortion = m_nerf.render_with_lens_distortion && !m_dlss ? m_distortion.inference_view() : Buffer2DView{}; + // Our motion vector code can't undo grid distortions -- so don't render grid distortion if DLSS is enabled. + // (Unless we're in distortion visualization mode, in which case the distortion grid is fine to visualize.) + auto grid_distortion = + m_nerf.render_with_lens_distortion && (!m_dlss || m_render_mode == ERenderMode::Distortion) ? + m_distortion.inference_view() : + Buffer2DView{}; + Lens lens = m_nerf.render_with_lens_distortion ? 
m_nerf.render_lens : Lens{}; auto resolution = render_buffer.resolution; tracer.init_rays_from_camera( render_buffer.spp, - nerf_network.padded_output_width(), - nerf_network.n_extra_dims(), + nerf_network->padded_output_width(), + nerf_network->n_extra_dims(), render_buffer.resolution, focal_length, camera_matrix0, @@ -2413,11 +1885,13 @@ void Testbed::render_nerf( stream ); + float depth_scale = 1.0f / m_nerf.training.dataset.scale; + bool render_2d = m_render_mode == ERenderMode::Slice || m_render_mode == ERenderMode::Distortion; + uint32_t n_hit; - if (m_render_mode == ERenderMode::Slice) { + if (render_2d) { n_hit = tracer.n_rays_initialized(); } else { - float depth_scale = 1.0f / m_nerf.training.dataset.scale; n_hit = tracer.trace( nerf_network, m_render_aabb, @@ -2442,28 +1916,31 @@ void Testbed::render_nerf( stream ); } - RaysNerfSoa& rays_hit = m_render_mode == ERenderMode::Slice ? tracer.rays_init() : tracer.rays_hit(); + RaysNerfSoa& rays_hit = render_2d ? tracer.rays_init() : tracer.rays_hit(); - if (m_render_mode == ERenderMode::Slice) { + if (render_2d) { // Store colors in the normal buffer - uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity); - const uint32_t floats_per_coord = sizeof(NerfCoordinate) / sizeof(float) + nerf_network.n_extra_dims(); - const uint32_t extra_stride = nerf_network.n_extra_dims() * sizeof(float); // extra stride on top of base NerfCoordinate struct + uint32_t n_elements = next_multiple(n_hit, BATCH_SIZE_GRANULARITY); + const uint32_t floats_per_coord = sizeof(NerfCoordinate) / sizeof(float) + nerf_network->n_extra_dims(); + const uint32_t extra_stride = nerf_network->n_extra_dims() * sizeof(float); // extra stride on top of base NerfCoordinate struct GPUMatrix positions_matrix{floats_per_coord, n_elements, stream}; GPUMatrix rgbsigma_matrix{4, n_elements, stream}; - linear_kernel(generate_nerf_network_inputs_at_current_position, 0, stream, n_hit, m_aabb, rays_hit.payload, PitchedPtr((NerfCoordinate*)positions_matrix.data(), 1, 0, extra_stride), extra_dims_gpu ); + linear_kernel(generate_nerf_network_inputs_at_current_position, 0, stream, n_hit, m_aabb, rays_hit.payload, PitchedPtr((NerfCoordinate*)positions_matrix.data(), 1, 0, extra_stride), extra_dims_gpu); if (visualized_dimension == -1) { - nerf_network.inference(stream, positions_matrix, rgbsigma_matrix); + nerf_network->inference(stream, positions_matrix, rgbsigma_matrix); linear_kernel(compute_nerf_rgba_kernel, 0, stream, n_hit, (vec4*)rgbsigma_matrix.data(), m_nerf.rgb_activation, m_nerf.density_activation, 0.01f, false); } else { - nerf_network.visualize_activation(stream, m_visualized_layer, visualized_dimension, positions_matrix, rgbsigma_matrix); + nerf_network->visualize_activation(stream, m_visualized_layer, visualized_dimension, positions_matrix, rgbsigma_matrix); } linear_kernel(shade_kernel_nerf, 0, stream, n_hit, + m_nerf.render_gbuffer_hard_edges, + camera_matrix1, + depth_scale, (vec4*)rgbsigma_matrix.data(), nullptr, rays_hit.payload, @@ -2477,6 +1954,9 @@ void Testbed::render_nerf( linear_kernel(shade_kernel_nerf, 0, stream, n_hit, + m_nerf.render_gbuffer_hard_edges, + camera_matrix1, + depth_scale, rays_hit.rgba, rays_hit.depth, rays_hit.payload, @@ -2601,7 +2081,7 @@ void Testbed::Nerf::Training::export_camera_extrinsics(const fs::path& path, boo mat4x3 Testbed::Nerf::Training::get_camera_extrinsics(int frame_idx) { if (frame_idx < 0 || frame_idx >= dataset.n_images) { - return mat4x3(1.0f); + return mat4x3::identity(); } return 
dataset.ngp_matrix_to_nerf(transforms[frame_idx].start); } @@ -2686,7 +2166,7 @@ void Testbed::load_nerf_post() { // moved the second half of load_nerf here m_nerf.training.cam_focal_length_gradient = vec2(0.0f); m_nerf.training.cam_focal_length_gradient_gpu.resize_and_copy_from_host(&m_nerf.training.cam_focal_length_gradient, 1); - m_nerf.training.reset_extra_dims(m_rng); + m_nerf.reset_extra_dims(m_rng); m_nerf.training.optimize_extra_dims = m_nerf.training.dataset.n_extra_learnable_dims > 0; if (m_nerf.training.dataset.has_rays) { @@ -2694,17 +2174,18 @@ void Testbed::load_nerf_post() { // moved the second half of load_nerf here } // Perturbation of the training cameras -- for debugging the online extrinsics learning code - // float perturb_amount = 0.0f; + // float perturb_amount = 0.01f; // if (perturb_amount > 0.f) { // for (uint32_t i = 0; i < m_nerf.training.dataset.n_images; ++i) { - // vec3 rot = random_val_3d(m_rng) * perturb_amount; - // float angle = rot.norm(); + // vec3 rot = (random_val_3d(m_rng) * 2.0f - 1.0f) * perturb_amount; + // vec3 trans = (random_val_3d(m_rng) * 2.0f - 1.0f) * perturb_amount; + // float angle = length(rot); // rot /= angle; - // auto trans = random_val_3d(m_rng); - // m_nerf.training.dataset.xforms[i].start.block<3,3>(0,0) = AngleAxisf(angle, rot).matrix() * m_nerf.training.dataset.xforms[i].start.block<3,3>(0,0); - // m_nerf.training.dataset.xforms[i].start[3] += trans * perturb_amount; - // m_nerf.training.dataset.xforms[i].end.block<3,3>(0,0) = AngleAxisf(angle, rot).matrix() * m_nerf.training.dataset.xforms[i].end.block<3,3>(0,0); - // m_nerf.training.dataset.xforms[i].end[3] += trans * perturb_amount; + + // auto rot_start = rotmat(angle, rot) * mat3(m_nerf.training.dataset.xforms[i].start); + // auto rot_end = rotmat(angle, rot) * mat3(m_nerf.training.dataset.xforms[i].end); + // m_nerf.training.dataset.xforms[i].start = mat4x3(rot_start[0], rot_start[1], rot_start[2], m_nerf.training.dataset.xforms[i].start[3] + trans); + // m_nerf.training.dataset.xforms[i].end = mat4x3(rot_end[0], rot_end[1], rot_end[2], m_nerf.training.dataset.xforms[i].end[3] + trans); // } // } @@ -2898,12 +2379,12 @@ __global__ void mark_density_grid_in_sphere_empty_kernel(const uint32_t n_elemen uint32_t level = i / NERF_GRID_N_CELLS(); uint32_t pos_idx = i % NERF_GRID_N_CELLS(); - uint32_t x = tcnn::morton3D_invert(pos_idx>>0); - uint32_t y = tcnn::morton3D_invert(pos_idx>>1); - uint32_t z = tcnn::morton3D_invert(pos_idx>>2); + uint32_t x = morton3D_invert(pos_idx>>0); + uint32_t y = morton3D_invert(pos_idx>>1); + uint32_t z = morton3D_invert(pos_idx>>2); float cell_radius = scalbnf(SQRT3(), level) / NERF_GRIDSIZE(); - vec3 cell_pos = ((vec3{(float)x+0.5f, (float)y+0.5f, (float)z+0.5f}) / (float)NERF_GRIDSIZE() - vec3(0.5f)) * scalbnf(1.0f, level) + vec3(0.5f); + vec3 cell_pos = ((vec3{(float)x+0.5f, (float)y+0.5f, (float)z+0.5f}) / (float)NERF_GRIDSIZE() - 0.5f) * scalbnf(1.0f, level) + 0.5f; // Disable if the cell touches the sphere (conservatively, by bounding the cell with a sphere) if (distance(pos, cell_pos) < radius + cell_radius) { @@ -2952,7 +2433,7 @@ float Testbed::NerfCounters::update_after_training(uint32_t target_batch_size, b } rays_per_batch = (uint32_t)((float)rays_per_batch * (float)target_batch_size / (float)measured_batch_size); - rays_per_batch = std::min(next_multiple(rays_per_batch, tcnn::batch_size_granularity), 1u << 18); + rays_per_batch = std::min(next_multiple(rays_per_batch, BATCH_SIZE_GRANULARITY), 1u << 18); return loss_scalar; } @@ 
-2998,7 +2479,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS uint32_t n_samples_per_image = (m_nerf.training.n_steps_between_error_map_updates * m_nerf.training.counters_rgb.rays_per_batch) / m_nerf.training.dataset.n_images; ivec2 res = m_nerf.training.dataset.metadata[0].resolution; m_nerf.training.error_map.resolution = min(ivec2((int)(std::sqrt(std::sqrt((float)n_samples_per_image)) * 3.5f)), res); - m_nerf.training.error_map.data.resize(compMul(m_nerf.training.error_map.resolution) * m_nerf.training.dataset.n_images); + m_nerf.training.error_map.data.resize(product(m_nerf.training.error_map.resolution) * m_nerf.training.dataset.n_images); CUDA_CHECK_THROW(cudaMemsetAsync(m_nerf.training.error_map.data.data(), 0, m_nerf.training.error_map.data.get_bytes(), stream)); } @@ -3007,16 +2488,15 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS CUDA_CHECK_THROW(cudaMemsetAsync(envmap_gradient, 0, sizeof(float)*m_envmap.envmap->n_params(), stream)); } - train_nerf_step(target_batch_size, m_nerf.training.counters_rgb, stream); - m_trainer->optimizer_step(stream, LOSS_SCALE); + m_trainer->optimizer_step(stream, LOSS_SCALE()); ++m_training_step; if (envmap_gradient) { - m_envmap.trainer->optimizer_step(stream, LOSS_SCALE); + m_envmap.trainer->optimizer_step(stream, LOSS_SCALE()); } float loss_scalar = m_nerf.training.counters_rgb.update_after_training(target_batch_size, get_loss_scalar, stream); @@ -3038,7 +2518,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS bool accumulate_error = true; if (accumulate_error && m_nerf.training.n_steps_since_error_map_update >= m_nerf.training.n_steps_between_error_map_updates) { m_nerf.training.error_map.cdf_resolution = m_nerf.training.error_map.resolution; - m_nerf.training.error_map.cdf_x_cond_y.resize(compMul(m_nerf.training.error_map.cdf_resolution) * m_nerf.training.dataset.n_images); + m_nerf.training.error_map.cdf_x_cond_y.resize(product(m_nerf.training.error_map.cdf_resolution) * m_nerf.training.dataset.n_images); m_nerf.training.error_map.cdf_y.resize(m_nerf.training.error_map.cdf_resolution.y * m_nerf.training.dataset.n_images); m_nerf.training.error_map.cdf_img.resize(m_nerf.training.dataset.n_images); @@ -3097,7 +2577,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS for (uint32_t i = 0; i < m_nerf.training.n_images_for_training; ++i) { std::vector gradient(n_extra_dims); for (uint32_t j = 0; j < n_extra_dims; ++j) { - gradient[j] = extra_dims_gradient[i * n_extra_dims + j] / LOSS_SCALE; + gradient[j] = extra_dims_gradient[i * n_extra_dims + j] / LOSS_SCALE(); } //float l2_reg = 1e-4f; @@ -3112,7 +2592,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS bool train_camera = m_nerf.training.optimize_extrinsics || m_nerf.training.optimize_distortion || m_nerf.training.optimize_focal_length || m_nerf.training.optimize_exposure; if (train_camera && m_nerf.training.n_steps_since_cam_update >= m_nerf.training.n_steps_between_cam_updates) { - float per_camera_loss_scale = (float)m_nerf.training.n_images_for_training / LOSS_SCALE / (float)m_nerf.training.n_steps_between_cam_updates; + float per_camera_loss_scale = (float)m_nerf.training.n_images_for_training / LOSS_SCALE() / (float)m_nerf.training.n_steps_between_cam_updates; if (m_nerf.training.optimize_extrinsics) { CUDA_CHECK_THROW(cudaMemcpyAsync(m_nerf.training.cam_pos_gradient.data(), 
m_nerf.training.cam_pos_gradient_gpu.data(), m_nerf.training.cam_pos_gradient_gpu.get_bytes(), cudaMemcpyDeviceToHost, stream)); @@ -3145,7 +2625,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS m_distortion.map->gradients(), m_distortion.map->gradient_weights() ); - m_distortion.trainer->optimizer_step(stream, LOSS_SCALE*(float)m_nerf.training.n_steps_between_cam_updates); + m_distortion.trainer->optimizer_step(stream, LOSS_SCALE() * (float)m_nerf.training.n_steps_between_cam_updates); } if (m_nerf.training.optimize_focal_length) { @@ -3161,7 +2641,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS if (m_nerf.training.optimize_exposure) { CUDA_CHECK_THROW(cudaMemcpyAsync(m_nerf.training.cam_exposure_gradient.data(), m_nerf.training.cam_exposure_gradient_gpu.data(), m_nerf.training.cam_exposure_gradient_gpu.get_bytes(), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); + vec3 mean_exposure = vec3(0.0f); // Optimization step @@ -3177,7 +2657,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS mean_exposure += m_nerf.training.cam_exposure[i].variable(); } - mean_exposure /= m_nerf.training.n_images_for_training; + mean_exposure /= (float)m_nerf.training.n_images_for_training; // Renormalize std::vector cam_exposures(m_nerf.training.n_images_for_training); @@ -3243,7 +2723,7 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, Testbed::NerfCounters& if (counters.measured_batch_size_before_compaction == 0) { counters.measured_batch_size_before_compaction = max_inference = max_samples; } else { - max_inference = next_multiple(std::min(counters.measured_batch_size_before_compaction, max_samples), tcnn::batch_size_granularity); + max_inference = next_multiple(std::min(counters.measured_batch_size_before_compaction, max_samples), BATCH_SIZE_GRANULARITY); } GPUMatrix compacted_coords_matrix((float*)coords_compacted, floats_per_coord, target_batch_size); @@ -3273,6 +2753,7 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, Testbed::NerfCounters& auto hg_enc = dynamic_cast*>(m_encoding.get()); + { linear_kernel(generate_training_samples_nerf, 0, stream, counters.rays_per_batch, m_aabb, @@ -3323,13 +2804,13 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, Testbed::NerfCounters& m_rng, target_batch_size, ray_counter, - LOSS_SCALE, + LOSS_SCALE(), padded_output_width, m_envmap.view(), envmap_gradient, m_envmap.resolution, m_envmap.loss_type, - m_background_color.rgb, + m_background_color.rgb(), m_color_space, m_nerf.training.random_bg_color, m_nerf.training.linear_colors, @@ -3368,14 +2849,15 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, Testbed::NerfCounters& m_nerf.training.depth_supervision_lambda, m_nerf.training.near_distance ); + } - fill_rollover_and_rescale<<>>( + fill_rollover_and_rescale<<>>( target_batch_size, padded_output_width, counters.numsteps_counter_compacted.data(), dloss_dmlp_out ); - fill_rollover<<>>( + fill_rollover<<>>( target_batch_size, floats_per_coord, counters.numsteps_counter_compacted.data(), (float*)coords_compacted ); - fill_rollover<<>>( + fill_rollover<<>>( target_batch_size, 1, counters.numsteps_counter_compacted.data(), max_level_compacted ); @@ -3384,10 +2866,7 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, Testbed::NerfCounters& bool prepare_input_gradients = train_camera || train_extra_dims; GPUMatrix coords_gradient_matrix((float*)coords_gradient, floats_per_coord, 
target_batch_size); - { - auto ctx = m_network->forward(stream, compacted_coords_matrix, &compacted_rgbsigma_matrix, false, prepare_input_gradients); - m_network->backward(stream, *ctx, compacted_coords_matrix, compacted_rgbsigma_matrix, gradient_matrix, prepare_input_gradients ? &coords_gradient_matrix : nullptr, false, EGradientMode::Overwrite); - } + m_trainer->training_step(stream, compacted_coords_matrix, {}, nullptr, false, prepare_input_gradients ? &coords_gradient_matrix : nullptr, false, GradientMode::Overwrite, &gradient_matrix); if (train_extra_dims) { // Compute extra-dim gradients @@ -3473,7 +2952,7 @@ void Testbed::optimise_mesh_step(uint32_t n_steps) { GPUMatrix positions_matrix((float*)coords.data(), floats_per_coord, n_verts); GPUMatrix density_matrix(mlp_out.data(), padded_output_width, n_verts); - const float* extra_dims_gpu = get_inference_extra_dims(m_stream.get()); + const float* extra_dims_gpu = m_nerf.get_rendering_extra_dims(m_stream.get()); for (uint32_t i = 0; i < n_steps; ++i) { linear_kernel(generate_nerf_network_inputs_from_positions, 0, m_stream.get(), @@ -3521,7 +3000,7 @@ void Testbed::compute_mesh_vertex_colors() { m_mesh.vert_colors.memset(0); if (m_testbed_mode == ETestbedMode::Nerf) { - const float* extra_dims_gpu = get_inference_extra_dims(m_stream.get()); + const float* extra_dims_gpu = m_nerf.get_rendering_extra_dims(m_stream.get()); const uint32_t floats_per_coord = sizeof(NerfCoordinate) / sizeof(float) + m_nerf_network->n_extra_dims(); const uint32_t extra_stride = m_nerf_network->n_extra_dims() * sizeof(float); @@ -3591,7 +3070,7 @@ GPUMemory Testbed::get_rgba_on_grid(ivec3 res3d, vec3 ray_dir, bool voxel_ const uint32_t n_elements = (res3d.x*res3d.y*res3d.z); GPUMemory rgba(n_elements); - const float* extra_dims_gpu = get_inference_extra_dims(m_stream.get()); + const float* extra_dims_gpu = m_nerf.get_rendering_extra_dims(m_stream.get()); const uint32_t floats_per_coord = sizeof(NerfCoordinate) / sizeof(float) + m_nerf_network->n_extra_dims(); const uint32_t extra_stride = m_nerf_network->n_extra_dims() * sizeof(float); @@ -3672,18 +3151,105 @@ uint8_t* Testbed::Nerf::get_density_grid_bitfield_mip(uint32_t mip) { return density_grid_bitfield.data() + grid_mip_offset(mip)/8; } -int Testbed::find_best_training_view(int default_view) { - int bestimage = default_view; - float bestscore = 1000.f; - for (int i = 0; i < m_nerf.training.n_images_for_training; ++i) { - float score = distance(m_nerf.training.transforms[i].start[3], m_camera[3]); - score += 0.25f * distance(m_nerf.training.transforms[i].start[2], m_camera[2]); +void Testbed::Nerf::reset_extra_dims(default_rng_t& rng) { + uint32_t n_extra_dims = training.dataset.n_extra_dims(); + std::vector<float> extra_dims_cpu(n_extra_dims * (training.dataset.n_images + 1)); // n_images + 1 since we use an extra 'slot' for the inference latent code + float* dst = extra_dims_cpu.data(); + training.extra_dims_opt = std::vector<VarAdamOptimizer>(training.dataset.n_images, VarAdamOptimizer(n_extra_dims, 1e-4f)); + for (uint32_t i = 0; i < training.dataset.n_images; ++i) { + vec3 light_dir = warp_direction(normalize(training.dataset.metadata[i].light_dir)); + training.extra_dims_opt[i].reset_state(); + std::vector<float>& optimizer_value = training.extra_dims_opt[i].variable(); + for (uint32_t j = 0; j < n_extra_dims; ++j) { + if (training.dataset.has_light_dirs && j < 3) { + dst[j] = light_dir[j]; + } else { + dst[j] = random_val(rng) * 2.0f - 1.0f; + } + optimizer_value[j] = dst[j]; + } + dst += n_extra_dims; + } +
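// Layout sketch of the staging buffer filled above, one row per training image
// plus a trailing scratch slot (consumed later by get_rendering_extra_dims()
// when a light direction has to be spliced in at render time):
//
//   [ img 0: n_extra_dims floats | img 1 | ... | img n-1 | scratch slot ]
//
// When the dataset carries light directions, the first three entries of each row
// hold the warped direction; all remaining entries start as uniform random values
// in [-1, 1] and are refined by the per-image Adam optimizers during training.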
training.extra_dims_gpu.resize_and_copy_from_host(extra_dims_cpu); + + rendering_extra_dims.resize(training.dataset.n_extra_dims()); + CUDA_CHECK_THROW(cudaMemcpy(rendering_extra_dims.data(), training.extra_dims_gpu.data(), rendering_extra_dims.bytes(), cudaMemcpyDeviceToDevice)); +} + +const float* Testbed::Nerf::get_rendering_extra_dims(cudaStream_t stream) const { + CHECK_THROW(rendering_extra_dims.size() == training.dataset.n_extra_dims()); + + if (training.dataset.n_extra_dims() == 0) { + return nullptr; + } + + const float* extra_dims_src = rendering_extra_dims_from_training_view >= 0 ? + training.extra_dims_gpu.data() + rendering_extra_dims_from_training_view * training.dataset.n_extra_dims() : + rendering_extra_dims.data(); + + if (!training.dataset.has_light_dirs) { + return extra_dims_src; + } + + // the dataset has light directions, so we must construct a temporary buffer and fill it as requested. + // we use an extra 'slot' that was pre-allocated for us at the end of the extra_dims array. + size_t size = training.dataset.n_extra_dims() * sizeof(float); + float* dims_gpu = training.extra_dims_gpu.data() + training.dataset.n_images * training.dataset.n_extra_dims(); + CUDA_CHECK_THROW(cudaMemcpyAsync(dims_gpu, extra_dims_src, size, cudaMemcpyDeviceToDevice, stream)); + vec3 light_dir = warp_direction(normalize(this->light_dir)); // read the member; a self-initialized local here would be undefined + CUDA_CHECK_THROW(cudaMemcpyAsync(dims_gpu, &light_dir, min(size, sizeof(vec3)), cudaMemcpyHostToDevice, stream)); + return dims_gpu; +} + +int Testbed::Nerf::find_closest_training_view(mat4x3 pose) const { + int bestimage = training.view; + float bestscore = std::numeric_limits<float>::infinity(); + for (int i = 0; i < training.n_images_for_training; ++i) { + float score = distance(training.transforms[i].start[3], pose[3]); + score += 0.25f * distance(training.transforms[i].start[2], pose[2]); if (score < bestscore) { bestscore = score; bestimage = i; } } + return bestimage; } -NGP_NAMESPACE_END +void Testbed::Nerf::set_rendering_extra_dims_from_training_view(int trainview) { + if (!training.dataset.n_extra_dims()) { + throw std::runtime_error{"Dataset does not have extra dims."}; + } + + if (trainview < 0 || trainview >= training.dataset.n_images) { + throw std::runtime_error{"Invalid training view."}; + } + + rendering_extra_dims_from_training_view = trainview; +} + +void Testbed::Nerf::set_rendering_extra_dims(const std::vector<float>& vals) { + CHECK_THROW(rendering_extra_dims.size() == training.dataset.n_extra_dims()); + + if (vals.size() != training.dataset.n_extra_dims()) { + throw std::runtime_error{fmt::format("Invalid number of extra dims.
+
+std::vector<float> Testbed::Nerf::get_rendering_extra_dims_cpu() const {
+	CHECK_THROW(rendering_extra_dims.size() == training.dataset.n_extra_dims());
+
+	if (training.dataset.n_extra_dims() == 0) {
+		return {};
+	}
+
+	std::vector<float> extra_dims_cpu(training.dataset.n_extra_dims());
+	CUDA_CHECK_THROW(cudaMemcpy(extra_dims_cpu.data(), get_rendering_extra_dims(nullptr), rendering_extra_dims.bytes(), cudaMemcpyDeviceToHost));
+
+	return extra_dims_cpu;
+}
+
+}
diff --git a/src/testbed_sdf.cu b/src/testbed_sdf.cu
index 1aa41b4b8..006d53781 100644
--- a/src/testbed_sdf.cu
+++ b/src/testbed_sdf.cu
@@ -30,9 +30,7 @@
 #include
 #include
 
-using namespace tcnn;
-
-NGP_NAMESPACE_BEGIN
+namespace ngp {
 
 static constexpr uint32_t MARCH_ITER = 10000;
@@ -108,12 +106,12 @@ __device__ vec3 evaluate_shading(
 		return amb;
 	}
 
-	float luminance = dot(base_color, vec3(0.3f, 0.6f, 0.1f));
+	float luminance = dot(base_color, vec3{0.3f, 0.6f, 0.1f});
 
 	// normalize luminance to isolate hue and saturation components
 	vec3 Ctint = base_color * (1.f/(luminance+0.00001f));
-	vec3 Cspec0 = mix(mix(vec3(1.0f,1.0f,1.0f), Ctint, specular_tint) * specular * 0.08f, base_color, metallic);
-	vec3 Csheen = mix(vec3(1.0f,1.0f,1.0f), Ctint, sheen_tint);
+	vec3 Cspec0 = mix(mix(vec3(1.0f), Ctint, specular_tint) * specular * 0.08f, base_color, metallic);
+	vec3 Csheen = mix(vec3(1.0f), Ctint, sheen_tint);
 
 	float Fd90 = 0.5f + 2.0f * LdotH * LdotH * roughness;
 	float Fd = mix(1, Fd90, FL) * mix(1.f, Fd90, FV);
@@ -129,7 +127,7 @@ __device__ vec3 evaluate_shading(
 	float a= std::max(0.001f, square(roughness));
 	float Ds = G2(NdotH, a);
 	float FH = SchlickFresnel(LdotH);
-	vec3 Fs = mix(Cspec0, vec3(1.0f,1.0f,1.0f), FH);
+	vec3 Fs = mix(Cspec0, vec3(1.0f), FH);
 	float Gs = SmithG_GGX(NdotL, a) * SmithG_GGX(NdotV, a);
 
 	// sheen
@@ -142,7 +140,7 @@ __device__ vec3 evaluate_shading(
 	float CCs=0.25f * clearcoat * Gr * Fr * Dr;
 	vec3 brdf = (float(1.0f / PI()) * mix(Fd, ss, subsurface) * base_color + Fsheen) * (1.0f - metallic) +
-		Gs * Fs * Ds + vec3(CCs,CCs,CCs);
+		Gs * Fs * Ds + vec3{CCs, CCs, CCs};
 	return vec3(brdf * light_color) * NdotL + amb;
 }
@@ -324,7 +322,7 @@ __global__ void shade_kernel_sdf(
 	vec3 pos = positions[i];
 	bool floor = false;
 	if (pos.y < floor_y + 0.001f && payload.dir.y < 0.f) {
-		normal = vec3(0.f, 1.f, 0.f);
+		normal = vec3{0.0f, 1.0f, 0.0f};
 		floor = true;
 	}
@@ -361,14 +359,14 @@ __global__ void shade_kernel_sdf(
 		} break;
 		case ERenderMode::Depth: color = vec3(dot(cam_fwd, pos - cam_pos)); break;
 		case ERenderMode::Positions: {
-			color = (pos - vec3(0.5f)) / 2.0f + vec3(0.5f);
+			color = (pos - 0.5f) / 2.0f + 0.5f;
 		} break;
-		case ERenderMode::Normals: color = 0.5f * normal + vec3(0.5f); break;
+		case ERenderMode::Normals: color = 0.5f * normal + 0.5f; break;
 		case ERenderMode::Cost: color = vec3((float)payload.n_steps / 30); break;
 		case ERenderMode::EncodingVis: color = normals[i]; break;
 	}
 
-	frame_buffer[payload.idx] = {color.rgb, 1.0f};
+	frame_buffer[payload.idx] = {color.r, color.g, color.b, 1.0f};
 	depth_buffer[payload.idx] = dot(cam_fwd, pos - cam_pos);
 }
@@ -543,7 +541,7 @@ __global__ void init_rays_with_payload_kernel_sdf(
 	Ray ray = pixel_to_ray(
 		sample_index,
-		{x, y},
+		{(int)x, (int)y},
 		resolution,
 		focal_length,
 		camera_matrix,
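The `enlarge` hunks just below swap `tcnn::batch_size_granularity` for the renamed `BATCH_SIZE_GRANULARITY`; both round buffer sizes up so that network batches keep tcnn's required alignment. A standalone sketch of the rounding, assuming `next_multiple` has the usual ceil-to-multiple semantics and a granularity of 256 (check the tcnn version in use for the actual constant):

	#include <cstdint>

	// Round n up to the next multiple of granularity.
	constexpr uint32_t next_multiple_of(uint32_t n, uint32_t granularity) {
		return ((n + granularity - 1) / granularity) * granularity;
	}

	// e.g. 1000 rays are padded to 1024; the 24 padding elements are allocated
	// but never read back, since the kernels only consume the first n entries.
	static_assert(next_multiple_of(1000, 256) == 1024, "padding example");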
@@ -798,7 +796,7 @@ uint32_t Testbed::SphereTracer::trace(
 }
 
 void Testbed::SphereTracer::enlarge(size_t n_elements, cudaStream_t stream) {
-	n_elements = next_multiple(n_elements, size_t(tcnn::batch_size_granularity));
+	n_elements = next_multiple(n_elements, size_t(BATCH_SIZE_GRANULARITY));
 	auto scratch = allocate_workspace_and_distribute<
 		vec3, vec3, float, float, float, float, SdfPayload, // m_rays[0]
 		vec3, vec3, float, float, float, float, SdfPayload, // m_rays[1]
@@ -824,7 +822,7 @@ void Testbed::SphereTracer::enlarge(size_t n_elements, cudaStream_t stream) {
 }
 
 void Testbed::FiniteDifferenceNormalsApproximator::enlarge(uint32_t n_elements, cudaStream_t stream) {
-	n_elements = next_multiple(n_elements, tcnn::batch_size_granularity);
+	n_elements = next_multiple(n_elements, BATCH_SIZE_GRANULARITY);
 	auto scratch = allocate_workspace_and_distribute<
 		vec3, vec3, vec3, float, float, float,
@@ -960,10 +958,10 @@ void Testbed::render_sdf(
 	if (m_render_mode == ERenderMode::Slice) {
 		if (visualized_dimension == -1) {
 			distance_function(n_hit, rays_hit.pos, rays_hit.distance, stream);
-			extract_dimension_pos_neg_kernel<float><<<n_blocks_linear(n_hit*3), n_threads_linear, 0, stream>>>(n_hit*3, 0, 1, 3, rays_hit.distance, CM, (float*)rays_hit.normal);
+			extract_dimension_pos_neg_kernel<float><<<n_blocks_linear(n_hit*3), N_THREADS_LINEAR, 0, stream>>>(n_hit*3, 0, 1, 3, rays_hit.distance, CM, (float*)rays_hit.normal);
 		} else {
 			// Store colors in the normal buffer
-			uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity);
+			uint32_t n_elements = next_multiple(n_hit, BATCH_SIZE_GRANULARITY);
 
 			GPUMatrix<float> positions_matrix((float*)rays_hit.pos, 3, n_elements);
 			GPUMatrix<float> colors_matrix((float*)rays_hit.normal, 3, n_elements);
@@ -1024,7 +1022,7 @@ void Testbed::render_sdf(
 		}
 	} else if (render_mode == ERenderMode::EncodingVis && m_render_mode != ERenderMode::Slice) {
 		// HACK: Store colors temporarily in the normal buffer
-		uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity);
+		uint32_t n_elements = next_multiple(n_hit, BATCH_SIZE_GRANULARITY);
 
 		GPUMatrix<float> positions_matrix((float*)rays_hit.pos, 3, n_elements);
 		GPUMatrix<float> colors_matrix((float*)rays_hit.normal, 3, n_elements);
@@ -1124,13 +1122,13 @@ void Testbed::load_mesh(const fs::path& data_path) {
 	const float inflation = 0.005f;
 	m_raw_aabb.inflate(length(m_raw_aabb.diag()) * inflation);
-	m_sdf.mesh_scale = compMax(m_raw_aabb.diag());
+	m_sdf.mesh_scale = max(m_raw_aabb.diag());
 
 	// Normalize vertex coordinates to lie within [0,1]^3.
 	// This way, none of the constants need to carry around
 	// bounding box factors.
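 	// [Sketch, not part of the patch: with c = m_raw_aabb.min + 0.5f * m_raw_aabb.diag()
 	// and s = m_sdf.mesh_scale = max(m_raw_aabb.diag()), the loop below applies
 	// v' = (v - c) / s + 0.5f, which maps the longest axis of the inflated AABB
 	// onto [0, 1] and centers the mesh at (0.5, 0.5, 0.5).]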
for (size_t i = 0; i < n_vertices; ++i) { - vertices[i] = (vertices[i] - m_raw_aabb.min - 0.5f * m_raw_aabb.diag()) / m_sdf.mesh_scale + vec3(0.5f); + vertices[i] = (vertices[i] - m_raw_aabb.min - 0.5f * m_raw_aabb.diag()) / m_sdf.mesh_scale + 0.5f; } m_aabb = {}; @@ -1141,7 +1139,7 @@ void Testbed::load_mesh(const fs::path& data_path) { m_aabb.inflate(length(m_aabb.diag()) * inflation); m_aabb = m_aabb.intersection(BoundingBox{vec3(0.0f), vec3(1.0f)}); m_render_aabb = m_aabb; - m_render_aabb_to_local = mat3(1.0f); + m_render_aabb_to_local = mat3::identity(); m_mesh.thresh = 0.f; m_sdf.triangles_cpu.resize(n_triangles); @@ -1397,4 +1395,4 @@ double Testbed::calculate_iou(uint32_t n_samples, float scale_existing_results_f return countercpu[4]/double(countercpu[5]); } -NGP_NAMESPACE_END +} diff --git a/src/testbed_volume.cu b/src/testbed_volume.cu index 63581115e..efdfff42e 100644 --- a/src/testbed_volume.cu +++ b/src/testbed_volume.cu @@ -31,9 +31,7 @@ #include -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { Testbed::NetworkDims Testbed::network_dims_volume() const { NetworkDims dims; @@ -54,7 +52,7 @@ __device__ vec4 proc_envmap(const vec3& dir, const vec3& up_dir, const vec3& sun sunam *= sunam; vec4 result; - result.rgb = skycol * skyam + vec3{255.f/255.0f, 215.f/255.0f, 195.f/255.0f} * (20.f * sunam); + result.rgb() = skycol * skyam + vec3{255.f/255.0f, 215.f/255.0f, 195.f/255.0f} * (20.f * sunam); result.a = 1.0f; return result; } @@ -75,7 +73,7 @@ __device__ inline bool walk_to_next_event(default_rng_t &rng, const BoundingBox float dt = -std::log(1.0f - zeta1) * scale; // todo - for spatially varying majorant, we must check dt against the range over which the majorant is defined. we can turn this into an optical thickness accumulating loop... pos += dir*dt; if (!aabb.contains(pos)) return false; // escape to the mooon! - uint32_t bitidx = tcnn::morton3D(int(pos.x*128.f+0.5f),int(pos.y*128.f+0.5f),int(pos.z*128.f+0.5f)); + uint32_t bitidx = morton3D(int(pos.x*128.f+0.5f),int(pos.y*128.f+0.5f),int(pos.z*128.f+0.5f)); if (bitidx<128*128*128 && bitgrid[bitidx>>3]&(1<<(bitidx&7))) break; // loop around and try again as we are in density=0 region! 
} @@ -112,7 +110,7 @@ __global__ void volume_generate_training_data_kernel(uint32_t n_elements, auto acc = grid->tree().getAccessor(); while (numout < MAX_TRAIN_VERTICES) { uint32_t prev_numout = numout; - vec3 pos = random_dir(rng) * 2.0f + vec3(0.5f); + vec3 pos = random_dir(rng) * 2.0f + 0.5f; vec3 target = random_val_3d(rng) * aabb.diag() + aabb.min; vec3 dir = normalize(target - pos); auto box_intersection = aabb.ray_intersect(pos, dir); @@ -169,25 +167,25 @@ void Testbed::train_volume(size_t target_batch_size, bool get_loss_scalar, cudaS m_volume.training.targets.enlarge(n_elements); float distance_scale = 1.f/std::max(m_volume.inv_distance_scale,0.01f); - auto sky_col = m_background_color.rgb; + auto sky_col = m_background_color.rgb(); linear_kernel(volume_generate_training_data_kernel, 0, stream, n_elements / MAX_TRAIN_VERTICES, - m_volume.training.positions.data(), - m_volume.training.targets.data(), - m_volume.nanovdb_grid.data(), - m_volume.bitgrid.data(), - m_volume.world2index_offset, - m_volume.world2index_scale, - m_render_aabb, - m_rng, - m_volume.albedo, - m_volume.scattering, - distance_scale, - m_volume.global_majorant, - m_up_dir, - m_sun_dir, - sky_col - ); + m_volume.training.positions.data(), + m_volume.training.targets.data(), + m_volume.nanovdb_grid.data(), + m_volume.bitgrid.data(), + m_volume.world2index_offset, + m_volume.world2index_scale, + m_render_aabb, + m_rng, + m_volume.albedo, + m_volume.scattering, + distance_scale, + m_volume.global_majorant, + m_up_dir, + m_sun_dir, + sky_col + ); m_rng.advance(n_elements*256); GPUMatrix training_batch_matrix((float*)(m_volume.training.positions.data()), n_input_dims, batch_size); @@ -243,7 +241,7 @@ __global__ void init_rays_volume( Ray ray = pixel_to_ray( sample_index, - {x, y}, + {(int)x, (int)y}, resolution, focal_length, camera_matrix, @@ -398,7 +396,7 @@ __global__ void volume_render_kernel_step( if (extinction_prob>1.f) extinction_prob=1.f; float T = 1.f - payload.col.a; float alpha = extinction_prob * T; - payload.col.rgb += local_output.rgb * alpha; + payload.col.rgb() += local_output.rgb() * alpha; payload.col.a += alpha; if (payload.col.a > 0.99f || !walk_to_next_event(rng, aabb, pos, dir, bitgrid, scale) || force_finish_ray) { payload.col += (1.f-payload.col.a) * proc_envmap_render(dir, up_dir, sun_dir, sky_col); @@ -423,14 +421,14 @@ void Testbed::render_volume( auto res = render_buffer.resolution; size_t n_pixels = (size_t)res.x * res.y; - for (uint32_t i=0;i<2;++i) { + for (uint32_t i = 0; i < 2; ++i) { m_volume.pos[i].enlarge(n_pixels); m_volume.payload[i].enlarge(n_pixels); } m_volume.hit_counter.enlarge(2); m_volume.hit_counter.memset(0); - vec3 sky_col = m_background_color.rgb; + vec3 sky_col = m_background_color.rgb(); const dim3 threads = { 16, 8, 1 }; const dim3 blocks = { div_round_up((uint32_t)res.x, threads.x), div_round_up((uint32_t)res.y, threads.y), 1 }; @@ -462,11 +460,11 @@ void Testbed::render_volume( m_sun_dir, sky_col ); - m_rng.advance(n_pixels*256); + m_rng.advance(n_pixels * 256); - uint32_t n=n_pixels; - CUDA_CHECK_THROW(cudaDeviceSynchronize()); - cudaMemcpy(&n, m_volume.hit_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost); + uint32_t n = n_pixels; + CUDA_CHECK_THROW(cudaMemcpyAsync(&n, m_volume.hit_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); if (m_render_ground_truth) { linear_kernel(volume_render_kernel_gt, 0, stream, @@ -491,21 +489,21 @@ void Testbed::render_volume( m_volume.scattering, 
 			render_buffer.frame_buffer
 		);
-		m_rng.advance(n_pixels*256);
+		m_rng.advance(n_pixels * 256);
 	} else {
 		m_volume.radiance_and_density.enlarge(n);
 		int max_iter = 64;
-		for (int iter=0;iter<max_iter&&n>0;++iter) {
-			uint32_t srcbuf=(iter&1);
-			uint32_t dstbuf=1-srcbuf;
+		for (int iter = 0; iter < max_iter && n > 0; ++iter) {
+			uint32_t srcbuf = (iter & 1);
+			uint32_t dstbuf = 1 - srcbuf;
 
-			uint32_t n_elements = next_multiple(n, tcnn::batch_size_granularity);
+			uint32_t n_elements = next_multiple(n, BATCH_SIZE_GRANULARITY);
 
 			GPUMatrix<float> positions_matrix((float*)m_volume.pos[srcbuf].data(), 3, n_elements);
 			GPUMatrix<float> densities_matrix((float*)m_volume.radiance_and_density.data(), 4, n_elements);
 			m_network->inference(stream, positions_matrix, densities_matrix);
 
-			cudaMemsetAsync(m_volume.hit_counter.data()+dstbuf,0,sizeof(uint32_t));
+			CUDA_CHECK_THROW(cudaMemsetAsync(m_volume.hit_counter.data() + dstbuf, 0, sizeof(uint32_t), stream));
 
 			linear_kernel(volume_render_kernel_step, 0, stream,
 				n,
@@ -533,11 +531,12 @@ void Testbed::render_volume(
 				render_buffer.frame_buffer,
 				(iter>=max_iter-1)
 			);
-			m_rng.advance(n_pixels*256);
-			if (((iter+1) % 4)==0) {
+
+			m_rng.advance(n_pixels * 256);
+			if (((iter + 1) % 4) == 0) {
 				// periodically tell the cpu how many pixels are left
-				CUDA_CHECK_THROW(cudaMemcpyAsync(&n, m_volume.hit_counter.data()+dstbuf, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
-				CUDA_CHECK_THROW(cudaDeviceSynchronize());
+				CUDA_CHECK_THROW(cudaMemcpyAsync(&n, m_volume.hit_counter.data() + dstbuf, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+				CUDA_CHECK_THROW(cudaStreamSynchronize(stream));
 			}
 		}
 	}
@@ -546,27 +545,27 @@
 
 #define NANOVDB_MAGIC_NUMBER 0x304244566f6e614eUL // "NanoVDB0" in hex - little endian (uint64_t)
 
 struct NanoVDBFileHeader {
-	uint64_t magic; // 8 bytes
-	uint32_t version; // 4 bytes version numbers
-	uint16_t gridCount; // 2 bytes
-	uint16_t codec; // 2 bytes - must be 0
+	uint64_t magic;     // 8 bytes
+	uint32_t version;   // 4 bytes version numbers
+	uint16_t gridCount; // 2 bytes
+	uint16_t codec;     // 2 bytes - must be 0
 };
 static_assert(sizeof(NanoVDBFileHeader) == 16, "nanovdb padding error");
 
 struct NanoVDBMetaData {
-	uint64_t gridSize, fileSize, nameKey, voxelCount; // 4 * 8 = 32B.
-	uint32_t gridType; // 4B.
-	uint32_t gridClass; // 4B.
-	double worldBBox[2][3]; // 2 * 3 * 8 = 48B.
-	int indexBBox[2][3]; // 2 * 3 * 4 = 24B.
-	double voxelSize[3]; // 24B.
-	uint32_t nameSize; // 4B.
-	uint32_t nodeCount[4]; // 4 x 4 = 16B
-	uint32_t tileCount[3]; // 3 x 4 = 12B
-	uint16_t codec; // 2B
-	uint16_t padding; // 2B, due to 8B alignment from uint64_t
-	uint32_t version; // 4B
+	uint64_t gridSize, fileSize, nameKey, voxelCount; // 4 * 8 = 32B.
+	uint32_t gridType;      // 4B.
+	uint32_t gridClass;     // 4B.
+	double worldBBox[2][3]; // 2 * 3 * 8 = 48B.
+	int indexBBox[2][3];    // 2 * 3 * 4 = 24B.
+	double voxelSize[3];    // 24B.
+	uint32_t nameSize;      // 4B.
+ uint32_t nodeCount[4]; // 4 x 4 = 16B + uint32_t tileCount[3]; // 3 x 4 = 12B + uint16_t codec; // 2B + uint16_t padding; // 2B, due to 8B alignment from uint64_t + uint32_t version; // 4B }; static_assert(sizeof(NanoVDBMetaData) == 176, "nanovdb padding error"); @@ -617,7 +616,7 @@ void Testbed::load_volume(const fs::path& data_path) { vec3{0.5f - xsize * scale * 0.5f, 0.5f - ysize * scale * 0.5f, 0.5f - zsize * scale * 0.5f}, vec3{0.5f + xsize * scale * 0.5f, 0.5f + ysize * scale * 0.5f, 0.5f + zsize * scale * 0.5f}, }; - m_render_aabb_to_local = mat3(1.0f); + m_render_aabb_to_local = mat3::identity(); m_volume.world2index_scale = maxsize; m_volume.world2index_offset = vec3{ @@ -639,7 +638,7 @@ void Testbed::load_volume(const fs::path& data_path) { float fx = ((i + 0.5f) - m_volume.world2index_offset.x) / m_volume.world2index_scale; float fy = ((j + 0.5f) - m_volume.world2index_offset.y) / m_volume.world2index_scale; float fz = ((k + 0.5f) - m_volume.world2index_offset.z) / m_volume.world2index_scale; - uint32_t bitidx = tcnn::morton3D(int(fx * 128.0f + 0.5f), int(fy * 128.0f + 0.5f), int(fz * 128.0f + 0.5f)); + uint32_t bitidx = morton3D(int(fx * 128.0f + 0.5f), int(fy * 128.0f + 0.5f), int(fz * 128.0f + 0.5f)); if (bitidx < 128 * 128 * 128) bitgrid[bitidx / 8] |= 1 << (bitidx & 7); } @@ -650,4 +649,4 @@ void Testbed::load_volume(const fs::path& data_path) { m_volume.global_majorant = mx; } -NGP_NAMESPACE_END +} diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp index 6939ecdba..146d372d5 100644 --- a/src/thread_pool.cpp +++ b/src/thread_pool.cpp @@ -16,7 +16,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { ThreadPool::ThreadPool() : ThreadPool{std::thread::hardware_concurrency()} {} @@ -98,4 +98,4 @@ void ThreadPool::flush_queue() { m_task_queue.clear(); } -NGP_NAMESPACE_END +} diff --git a/src/tinyexr_wrapper.cu b/src/tinyexr_wrapper.cu index 00e451a81..0ff33118e 100644 --- a/src/tinyexr_wrapper.cu +++ b/src/tinyexr_wrapper.cu @@ -20,7 +20,7 @@ #include -#ifdef __NVCC__ +#ifdef __CUDACC__ # ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ # pragma nv_diag_suppress 174 # pragma nv_diag_suppress 550 @@ -33,9 +33,7 @@ #define TINYEXR_IMPLEMENTATION #include -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { template __global__ void interleave_and_cast_kernel(const uint32_t num_pixels, bool has_alpha, const T* __restrict__ in, __half* __restrict__ out, bool fix_pre_mult) { @@ -257,4 +255,4 @@ __half* load_exr_to_gpu(int* width, int* height, const fs::path& path, bool fix_ return result; } -NGP_NAMESPACE_END +} diff --git a/src/tinyobj_loader_wrapper.cpp b/src/tinyobj_loader_wrapper.cu similarity index 95% rename from src/tinyobj_loader_wrapper.cpp rename to src/tinyobj_loader_wrapper.cu index 0a0fe8e69..5b844ef07 100644 --- a/src/tinyobj_loader_wrapper.cpp +++ b/src/tinyobj_loader_wrapper.cu @@ -14,7 +14,7 @@ * interface to load OBJ-based meshes. 
*/ -#include +#include #include #include @@ -22,10 +22,9 @@ #define TINYOBJLOADER_IMPLEMENTATION #include -#include #include -NGP_NAMESPACE_BEGIN +namespace ngp { std::vector load_obj(const fs::path& path) { tinyobj::attrib_t attrib; @@ -81,4 +80,4 @@ std::vector load_obj(const fs::path& path) { return result; } -NGP_NAMESPACE_END +} diff --git a/src/triangle_bvh.cu b/src/triangle_bvh.cu index 7f8092741..0e091bea6 100644 --- a/src/triangle_bvh.cu +++ b/src/triangle_bvh.cu @@ -12,8 +12,9 @@ * @author Thomas Müller & Alex Evans, NVIDIA */ -#include +#include #include + #include #include @@ -38,9 +39,7 @@ namespace optix_ptx { } #endif //NGP_OPTIX -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { constexpr float MAX_DIST = 10.0f; @@ -409,7 +408,7 @@ public: vec3 closest_point = tri.closest_point(point); vec3 avg_normal = avg_normal_around_point(closest_point, bvhnodes, triangles); - return std::copysignf(p.second, dot(avg_normal, point - closest_point)); + return copysign(p.second, dot(avg_normal, point - closest_point)); } __host__ __device__ static float signed_distance_raystab(const vec3& point, const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, float max_distance_sq, default_rng_t rng={}) { @@ -543,7 +542,7 @@ public: // Root m_nodes.emplace_back(); - m_nodes.front().bb = BoundingBox(std::begin(triangles), std::end(triangles)); + m_nodes.front().bb = BoundingBox(triangles.data(), triangles.data() + triangles.size()); struct BuildNode { int node_idx; @@ -584,7 +583,7 @@ public: } var /= (float)std::distance(child.begin, child.end); - float max_val = compMax(var); + float max_val = max(var); int axis = var.x == max_val ? 0 : (var.y == max_val ? 1 : 2); auto m = child.begin + std::distance(child.begin, child.end)/2; @@ -606,7 +605,7 @@ public: child.node_idx = (int)m_nodes.size(); m_nodes.emplace_back(); - m_nodes.back().bb = BoundingBox(child.begin, child.end); + m_nodes.back().bb = BoundingBox(&*child.begin, &*child.end); if (std::distance(child.begin, child.end) <= n_primitives_per_leaf) { m_nodes.back().left_idx = -(int)std::distance(std::begin(triangles), child.begin)-1; @@ -721,6 +720,6 @@ __global__ void raytrace_kernel(uint32_t n_elements, vec3* __restrict__ position } } -NGP_NAMESPACE_END +}
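A closing note on the `copysign` change in the BVH signed-distance code above: the sign test is independent of the GLM-to-tcnn switch and can be sketched standalone (hypothetical names; assumes `avg_normal` is the angle-weighted outward normal around the closest surface point):

	#include <cmath>

	struct v3 { float x, y, z; };
	inline float dot3(v3 a, v3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }

	// Give the unsigned distance d the sign of the query offset projected onto
	// the surface normal: positive outside the mesh, negative inside.
	inline float signed_distance(float d, v3 avg_normal, v3 point_minus_closest) {
		return std::copysign(d, dot3(avg_normal, point_minus_closest));
	}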