From b9b7753ba5a0fa280c837a7c4f088f9c637ad575 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20M=C3=BCller?= Date: Sun, 9 Jul 2023 11:34:03 +0200 Subject: [PATCH] Replace GLM with tcnn's vector math, bugfixes and extra bindings --- .gitmodules | 3 - CMakeLists.txt | 10 +- dependencies/glm | 1 - dependencies/tiny-cuda-nn | 2 +- .../adam_optimizer.h | 6 +- .../bounding_box.cuh | 36 +- .../neural-graphics-primitives/camera_path.h | 10 +- include/neural-graphics-primitives/common.h | 194 +--- .../common_device.cuh | 184 +--- .../neural-graphics-primitives/common_host.h | 170 ++++ .../discrete_distribution.h | 6 +- include/neural-graphics-primitives/dlss.h | 4 +- include/neural-graphics-primitives/envmap.cuh | 17 +- .../neural-graphics-primitives/json_binding.h | 137 +-- .../marching_cubes.h | 41 +- include/neural-graphics-primitives/nerf.h | 77 +- .../nerf_device.cuh | 617 +++++++++++ .../neural-graphics-primitives/nerf_loader.h | 54 +- .../neural-graphics-primitives/nerf_network.h | 166 +-- .../neural-graphics-primitives/openxr_hmd.h | 4 +- .../neural-graphics-primitives/random_val.cuh | 14 +- .../render_buffer.h | 12 +- include/neural-graphics-primitives/sdf.h | 10 +- .../neural-graphics-primitives/shared_queue.h | 4 +- .../takikawa_encoding.cuh | 118 +-- include/neural-graphics-primitives/testbed.h | 237 +++-- .../neural-graphics-primitives/thread_pool.h | 4 +- .../tinyexr_wrapper.h | 6 +- .../tinyobj_loader_wrapper.h | 4 +- .../trainable_buffer.cuh | 32 +- .../neural-graphics-primitives/triangle.cuh | 29 +- .../triangle_bvh.cuh | 8 +- .../triangle_octree.cuh | 18 +- scripts/run.py | 3 +- src/camera_path.cu | 16 +- src/common_device.cu | 247 ----- src/{common.cu => common_host.cu} | 86 +- src/dlss.cu | 14 +- src/main.cu | 5 +- src/marching_cubes.cu | 49 +- src/nerf_loader.cu | 31 +- src/openxr_hmd.cu | 8 +- src/optix/pathescape.cu | 7 +- src/optix/pathescape.h | 4 +- src/optix/program.h | 5 +- src/optix/raystab.cu | 6 +- src/optix/raystab.h | 4 +- src/optix/raytrace.cu | 5 +- src/optix/raytrace.h | 4 +- src/python_api.cu | 32 +- src/render_buffer.cu | 62 +- src/testbed.cu | 314 +++--- src/testbed_image.cu | 50 +- src/testbed_nerf.cu | 958 +++++------------- src/testbed_sdf.cu | 42 +- src/testbed_volume.cu | 119 ++- src/thread_pool.cpp | 4 +- src/tinyexr_wrapper.cu | 8 +- ..._wrapper.cpp => tinyobj_loader_wrapper.cu} | 7 +- src/triangle_bvh.cu | 17 +- 60 files changed, 2023 insertions(+), 2319 deletions(-) delete mode 160000 dependencies/glm create mode 100644 include/neural-graphics-primitives/common_host.h create mode 100644 include/neural-graphics-primitives/nerf_device.cuh delete mode 100644 src/common_device.cu rename src/{common.cu => common_host.cu} (76%) rename src/{tinyobj_loader_wrapper.cpp => tinyobj_loader_wrapper.cu} (95%) diff --git a/.gitmodules b/.gitmodules index 4ef3aef16..1c90e8436 100644 --- a/.gitmodules +++ b/.gitmodules @@ -28,6 +28,3 @@ [submodule "dependencies/OpenXR-SDK"] path = dependencies/OpenXR-SDK url = https://github.com/KhronosGroup/OpenXR-SDK.git -[submodule "dependencies/glm"] - path = dependencies/glm - url = https://github.com/g-truc/glm diff --git a/CMakeLists.txt b/CMakeLists.txt index d6ac83e9a..9fce957e7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,7 +54,6 @@ if (MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /D_CRT_SECURE_NO_WARNINGS") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP24") else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fms-extensions") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") endif() @@ -76,7 +75,6 @@ if (MSVC) else() 
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=-Wno-float-conversion") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=-fno-strict-aliasing") - list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=-fms-extensions") list(APPEND CUDA_NVCC_FLAGS "-Xcompiler=-fPIC") endif() list(APPEND CUDA_NVCC_FLAGS "--extended-lambda") @@ -203,7 +201,6 @@ endif(NGP_BUILD_WITH_GUI) list(APPEND NGP_INCLUDE_DIRECTORIES "dependencies" "dependencies/filesystem" - "dependencies/glm" "dependencies/nanovdb" "dependencies/NaturalSort" "dependencies/tinylogger" @@ -261,8 +258,7 @@ endif() list(APPEND NGP_SOURCES ${GUI_SOURCES} src/camera_path.cu - src/common.cu - src/common_device.cu + src/common_host.cu src/marching_cubes.cu src/nerf_loader.cu src/render_buffer.cu @@ -273,7 +269,7 @@ list(APPEND NGP_SOURCES src/testbed_volume.cu src/thread_pool.cpp src/tinyexr_wrapper.cu - src/tinyobj_loader_wrapper.cpp + src/tinyobj_loader_wrapper.cu src/triangle_bvh.cu ) @@ -284,6 +280,8 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL ${CMAKE_BINARY_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${CMAKE_BINARY_DIR}) get_filename_component(CUDA_COMPILER_BIN "${CMAKE_CUDA_COMPILER}" DIRECTORY) +get_filename_component(CUDA_DIR "${CUDA_COMPILER_BIN}" DIRECTORY) +set(CUDA_INCLUDE "${CUDA_DIR}/include") if (NGP_OPTIX) add_library(optix_program OBJECT diff --git a/dependencies/glm b/dependencies/glm deleted file mode 160000 index efec5db08..000000000 --- a/dependencies/glm +++ /dev/null @@ -1 +0,0 @@ -Subproject commit efec5db081e3aad807d0731e172ac597f6a39447 diff --git a/dependencies/tiny-cuda-nn b/dependencies/tiny-cuda-nn index 8d2536b8b..28ca991f9 160000 --- a/dependencies/tiny-cuda-nn +++ b/dependencies/tiny-cuda-nn @@ -1 +1 @@ -Subproject commit 8d2536b8b324c998ff0ecec74e9d6a9c77bd45f3 +Subproject commit 28ca991f99b44d10387d73077c07ccfdd7f96275 diff --git a/include/neural-graphics-primitives/adam_optimizer.h b/include/neural-graphics-primitives/adam_optimizer.h index d62b83908..17cee1155 100644 --- a/include/neural-graphics-primitives/adam_optimizer.h +++ b/include/neural-graphics-primitives/adam_optimizer.h @@ -20,7 +20,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { class VarAdamOptimizer { public: @@ -241,7 +241,7 @@ class RotationAdamOptimizer { float actual_learning_rate = m_hparams.learning_rate * std::sqrt(1 - std::pow(m_hparams.beta2, m_state.iter)) / (1 - std::pow(m_hparams.beta1, m_state.iter)); m_state.first_moment = m_hparams.beta1 * m_state.first_moment + (1 - m_hparams.beta1) * gradient; m_state.second_moment = m_hparams.beta2 * m_state.second_moment + (1 - m_hparams.beta2) * gradient * gradient; - vec3 rot = actual_learning_rate * m_state.first_moment / (sqrt(m_state.second_moment) + vec3(m_hparams.epsilon)); + vec3 rot = actual_learning_rate * m_state.first_moment / (sqrt(m_state.second_moment) + m_hparams.epsilon); m_state.variable = rotvec(rotmat(-rot) * rotmat(variable())); } @@ -308,4 +308,4 @@ inline void from_json(const nlohmann::json& j, RotationAdamOptimizer& opt) { opt.from_json(j); } -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/bounding_box.cuh b/include/neural-graphics-primitives/bounding_box.cuh index d7a083e15..038bfb8e0 100644 --- a/include/neural-graphics-primitives/bounding_box.cuh +++ b/include/neural-graphics-primitives/bounding_box.cuh @@ -19,7 +19,7 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { template NGP_HOST_DEVICE inline void project(vec3 points[N_POINTS], const vec3& axis, float& min, float& max) { @@ -51,7 +51,7 @@ struct BoundingBox { enlarge(tri.c); } - 
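// A minimal scalar sketch of the bias-corrected Adam step that
// RotationAdamOptimizer above applies per rotation component: both moment
// estimates are updated as usual, and the two bias corrections are folded
// into the effective learning rate. All names below are illustrative.
#include <cmath>

struct AdamState { float m = 0.0f, v = 0.0f; int iter = 0; };

inline float adam_step(AdamState& s, float grad, float lr = 1e-3f, float beta1 = 0.9f, float beta2 = 0.99f, float eps = 1e-8f) {
	++s.iter;
	// lr * sqrt(1 - beta2^t) / (1 - beta1^t) is equivalent to dividing
	// m by (1 - beta1^t) and sqrt(v) by sqrt(1 - beta2^t).
	float lr_t = lr * std::sqrt(1.0f - std::pow(beta2, (float)s.iter)) / (1.0f - std::pow(beta1, (float)s.iter));
	s.m = beta1 * s.m + (1.0f - beta1) * grad;
	s.v = beta2 * s.v + (1.0f - beta2) * grad * grad;
	return lr_t * s.m / (std::sqrt(s.v) + eps); // step to subtract from the parameter
}
// The optimizer above then maps this step back onto the rotation manifold via
// rotvec(rotmat(-rot) * rotmat(variable())) rather than subtracting directly.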
BoundingBox(std::vector::iterator begin, std::vector::iterator end) { + NGP_HOST_DEVICE BoundingBox(Triangle* begin, Triangle* end) { min = max = begin->a; for (auto it = begin; it != end; ++it) { enlarge(*it); @@ -59,8 +59,8 @@ struct BoundingBox { } NGP_HOST_DEVICE void enlarge(const BoundingBox& other) { - min = glm::min(min, other.min); - max = glm::max(max, other.max); + min = tcnn::min(min, other.min); + max = tcnn::max(max, other.max); } NGP_HOST_DEVICE void enlarge(const Triangle& tri) { @@ -70,8 +70,8 @@ struct BoundingBox { } NGP_HOST_DEVICE void enlarge(const vec3& point) { - min = glm::min(min, point); - max = glm::max(max, point); + min = tcnn::min(min, point); + max = tcnn::max(max, point); } NGP_HOST_DEVICE void inflate(float amount) { @@ -93,8 +93,8 @@ struct BoundingBox { NGP_HOST_DEVICE BoundingBox intersection(const BoundingBox& other) const { BoundingBox result = *this; - result.min = glm::max(result.min, other.min); - result.max = glm::min(result.max, other.max); + result.min = tcnn::max(result.min, other.min); + result.max = tcnn::min(result.max, other.max); return result; } @@ -165,14 +165,14 @@ struct BoundingBox { float tmax = (max.x - pos.x) / dir.x; if (tmin > tmax) { - tcnn::host_device_swap(tmin, tmax); + host_device_swap(tmin, tmax); } float tymin = (min.y - pos.y) / dir.y; float tymax = (max.y - pos.y) / dir.y; if (tymin > tymax) { - tcnn::host_device_swap(tymin, tymax); + host_device_swap(tymin, tymax); } if (tmin > tymax || tymin > tmax) { @@ -191,7 +191,7 @@ struct BoundingBox { float tzmax = (max.z - pos.z) / dir.z; if (tzmin > tzmax) { - tcnn::host_device_swap(tzmin, tzmax); + host_device_swap(tzmin, tzmax); } if (tmin > tzmax || tzmin > tmax) { @@ -210,7 +210,7 @@ struct BoundingBox { } NGP_HOST_DEVICE bool is_empty() const { - return any(lessThan(max, min)); + return max.x < min.x || max.y < min.y || max.z < min.z; } NGP_HOST_DEVICE bool contains(const vec3& p) const { @@ -226,12 +226,12 @@ struct BoundingBox { } NGP_HOST_DEVICE float distance_sq(const vec3& p) const { - return length2(glm::max(glm::max(min - p, p - max), vec3(0.0f))); + return length2(tcnn::max(tcnn::max(min - p, p - max), vec3(0.0f))); } NGP_HOST_DEVICE float signed_distance(const vec3& p) const { vec3 q = abs(p - min) - diag(); - return length(glm::max(q, vec3(0.0f))) + std::min(compMax(q), 0.0f); + return length(tcnn::max(q, vec3(0.0f))) + std::min(tcnn::max(q), 0.0f); } NGP_HOST_DEVICE void get_vertices(vec3 v[8]) const { @@ -249,12 +249,4 @@ struct BoundingBox { vec3 max = vec3(-std::numeric_limits::infinity()); }; -inline std::ostream& operator<<(std::ostream& os, const ngp::BoundingBox& bb) { - os << "["; - os << "min=[" << bb.min.x << "," << bb.min.y << "," << bb.min.z << "], "; - os << "max=[" << bb.max.x << "," << bb.max.y << "," << bb.max.z << "]"; - os << "]"; - return os; } - -NGP_NAMESPACE_END diff --git a/include/neural-graphics-primitives/camera_path.h b/include/neural-graphics-primitives/camera_path.h index f379177a4..c3530e912 100644 --- a/include/neural-graphics-primitives/camera_path.h +++ b/include/neural-graphics-primitives/camera_path.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include @@ -28,7 +28,7 @@ struct ImDrawList; -NGP_NAMESPACE_BEGIN +namespace ngp { struct CameraKeyframe { quat R; @@ -40,7 +40,7 @@ struct CameraKeyframe { int glow_mode; float glow_y_cutoff; mat4x3 m() const { - auto rot = toMat3(normalize(quat(R))); + auto rot = to_mat3(normalize(quat(R))); return mat4x3(rot[0], rot[1], rot[2], T); } @@ -113,7 +113,7 @@ struct CameraPath { // 
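// A minimal sketch of using the slab test above to bound ray marching,
// assuming ray_intersect returns the entry/exit distances along the ray as a
// vec2 {tmin, tmax} (an empty interval, tmin > tmax, signals a miss).
BoundingBox aabb;
aabb.enlarge(vec3(0.0f));
aabb.enlarge(vec3(1.0f));
vec3 origin = {-1.0f, 0.5f, 0.5f}, dir = {1.0f, 0.0f, 0.0f};
vec2 t_range = aabb.ray_intersect(origin, dir); // expect roughly {1, 2} here
if (t_range.x <= t_range.y) {
	float t = max(t_range.x, 0.0f); // clamp in case the origin is inside the box
	// ... march samples in [t, t_range.y] ...
}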
add size to ensure no negative value is generated by modulo return keyframes[(i + size) % size]; } else { - return keyframes[tcnn::clamp(i, 0, (int)keyframes.size()-1)]; + return keyframes[clamp(i, 0, (int)keyframes.size()-1)]; } } CameraKeyframe eval_camera_path(float t) { @@ -142,5 +142,5 @@ void visualize_cube(ImDrawList* list, const mat4& world2proj, const vec3& a, con void visualize_nerf_camera(ImDrawList* list, const mat4& world2proj, const mat4x3& xform, float aspect, uint32_t col = 0x80ffffff, float thickness = 1.0f); #endif -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/common.h b/include/neural-graphics-primitives/common.h index 86dec3bbd..c6f503288 100644 --- a/include/neural-graphics-primitives/common.h +++ b/include/neural-graphics-primitives/common.h @@ -15,83 +15,29 @@ #pragma once - -#include - -#ifdef __NVCC__ -# ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ -# pragma nv_diag_suppress = unsigned_compare_with_zero -# pragma nv_diag_suppress 20011 -# pragma nv_diag_suppress 20014 -# else -# pragma diag_suppress = unsigned_compare_with_zero -# pragma diag_suppress 20011 -# pragma diag_suppress 20014 -# endif -#endif - -// For glm swizzles to work correctly, Microsoft extensions -// need to be enabled. This is done by the -fms-extensions -// flag (see CMakeLists.txt), and the following macro needs -// to be defined such that GLM is aware of this. -#ifndef _MSC_EXTENSIONS -#define _MSC_EXTENSIONS +#ifdef _WIN32 +# define NOMINMAX #endif -#define GLM_FORCE_SWIZZLE -#include -#include -#include -#include -#include -using namespace glm; +#include +using namespace tcnn; -#define NGP_NAMESPACE_BEGIN namespace ngp { -#define NGP_NAMESPACE_END } #if defined(__CUDA_ARCH__) - #if defined(__CUDACC_RTC__) || (defined(__clang__) && defined(__CUDA__)) - #define NGP_PRAGMA_UNROLL _Pragma("unroll") - #define NGP_PRAGMA_NO_UNROLL _Pragma("unroll 1") - #else - #define NGP_PRAGMA_UNROLL #pragma unroll - #define NGP_PRAGMA_NO_UNROLL #pragma unroll 1 - #endif + #define NGP_PRAGMA_UNROLL _Pragma("unroll") + #define NGP_PRAGMA_NO_UNROLL _Pragma("unroll 1") #else #define NGP_PRAGMA_UNROLL #define NGP_PRAGMA_NO_UNROLL #endif -#include - -#include -#include - -#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) +#if defined(__CUDACC__) || (defined(__clang__) && defined(__CUDA__)) #define NGP_HOST_DEVICE __host__ __device__ #else #define NGP_HOST_DEVICE #endif -NGP_NAMESPACE_BEGIN - -namespace fs = filesystem; - -bool is_wsl(); - -fs::path get_executable_dir(); -fs::path get_root_dir(); - -#ifdef _WIN32 -std::string utf16_to_utf8(const std::wstring& utf16); -std::wstring utf8_to_utf16(const std::string& utf16); -std::wstring native_string(const fs::path& path); -#else -std::string native_string(const fs::path& path); -#endif - -bool ends_with(const std::string& str, const std::string& ending); -bool ends_with_case_insensitive(const std::string& str, const std::string& ending); +namespace ngp { enum class EMeshRenderMode : int { Off, @@ -191,10 +137,6 @@ enum class ETestbedMode : int { None, }; -ETestbedMode mode_from_scene(const std::string& scene); -ETestbedMode mode_from_string(const std::string& str); -std::string to_string(ETestbedMode); - enum class EMlpAlgorithm : int { MMA, FMA, @@ -234,7 +176,7 @@ struct Ray { }; struct TrainingXForm { - bool operator==(const TrainingXForm& other) const { + NGP_HOST_DEVICE bool operator==(const TrainingXForm& other) const { return start == other.start && end == other.end; } @@ -252,7 +194,7 @@ enum class ELensMode : int { }; static 
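// A minimal sketch of the two macros defined above: NGP_HOST_DEVICE marks a
// function callable from both host and device code, and NGP_PRAGMA_UNROLL
// requests loop unrolling only when compiling device code (it expands to
// nothing on the host). The function itself is illustrative.
template <uint32_t N>
NGP_HOST_DEVICE float dot_n(const float* a, const float* b) {
	float result = 0.0f;
	NGP_PRAGMA_UNROLL
	for (uint32_t i = 0; i < N; ++i) {
		result += a[i] * b[i];
	}
	return result;
}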
constexpr const char* LensModeStr = "Perspective\0OpenCV\0F-Theta\0LatLong\0OpenCV Fisheye\0Equirectangular\0\0"; -inline bool supports_dlss(ELensMode mode) { +inline NGP_HOST_DEVICE bool supports_dlss(ELensMode mode) { return mode == ELensMode::Perspective || mode == ELensMode::OpenCV || mode == ELensMode::OpenCVFisheye; } @@ -261,10 +203,6 @@ struct Lens { float params[7] = {}; }; -inline NGP_HOST_DEVICE float sign(float x) { - return copysignf(1.0, x); -} - inline NGP_HOST_DEVICE uint32_t binary_search(float val, const float* data, uint32_t length) { if (length == 0) { return 0; @@ -287,115 +225,41 @@ inline NGP_HOST_DEVICE uint32_t binary_search(float val, const float* data, uint } } - return std::min(first, length-1); -} - -inline std::string replace_all(std::string str, const std::string& a, const std::string& b) { - std::string::size_type n = 0; - while ((n = str.find(a, n)) != std::string::npos) { - str.replace(n, a.length(), b); - n += b.length(); - } - return str; -} - -template -std::string join(const T& components, const std::string& delim) { - std::ostringstream s; - for (const auto& component : components) { - if (&components[0] != &component) { - s << delim; - } - s << component; - } - - return s.str(); + return min(first, length-1); } -enum class EEmaType { - Time, - Step, -}; - -class Ema { -public: - Ema(EEmaType type, float half_life) - : m_type{type}, m_decay{std::pow(0.5f, 1.0f / half_life)}, m_creation_time{std::chrono::steady_clock::now()} {} - - int64_t current_progress() { - if (m_type == EEmaType::Time) { - auto now = std::chrono::steady_clock::now(); - return std::chrono::duration_cast(now - m_creation_time).count(); - } else { - return m_last_progress + 1; - } - } - - void update(float val) { - int64_t cur = current_progress(); - int64_t elapsed = cur - m_last_progress; - m_last_progress = cur; - - float decay = std::pow(m_decay, elapsed); - m_val = val; - m_ema_val = decay * m_ema_val + (1.0f - decay) * val; - } - - void set(float val) { - m_last_progress = current_progress(); - m_val = m_ema_val = val; - } - - float val() const { - return m_val; - } - - float ema_val() const { - return m_ema_val; - } - -private: - float m_val = 0.0f; - float m_ema_val = 0.0f; - EEmaType m_type; - float m_decay; - - int64_t m_last_progress = 0; - std::chrono::time_point m_creation_time; -}; - template struct Buffer2DView { T* data = nullptr; - ivec2 resolution = ivec2(0); + ivec2 resolution = 0; // Lookup via integer pixel position (no bounds checking) - NGP_HOST_DEVICE T at(const ivec2& xy) const { - return data[xy.x + xy.y * resolution.x]; + NGP_HOST_DEVICE T at(const ivec2& px) const { + return data[px.x + px.y * resolution.x]; } // Lookup via UV coordinates in [0,1]^2 NGP_HOST_DEVICE T at(const vec2& uv) const { - ivec2 xy = clamp(ivec2(vec2(resolution) * uv), ivec2(0), resolution - ivec2(1)); - return at(xy); + ivec2 px = clamp(ivec2(vec2(resolution) * uv), 0, resolution - 1); + return at(px); } // Lookup via UV coordinates in [0,1]^2 and LERP the nearest texels NGP_HOST_DEVICE T at_lerp(const vec2& uv) const { - const vec2 xy_float = vec2(resolution) * uv; - const ivec2 xy = ivec2(xy_float); + const vec2 px_float = vec2(resolution) * uv; + const ivec2 px = ivec2(px_float); - const vec2 weight = xy_float - vec2(xy); + const vec2 weight = px_float - vec2(px); auto read_val = [&](ivec2 pos) { - return at(clamp(pos, ivec2(0), resolution - ivec2(1))); + return at(clamp(pos, 0, resolution - 1)); }; return ( - (1 - weight.x) * (1 - weight.y) * read_val({xy.x, xy.y}) + - 
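// A minimal sketch of binary_search above: it returns the first index whose
// value is >= val (a lower bound over sorted data), clamped to length-1 so a
// val beyond the last entry still yields a valid index. Handy for inverting a
// CDF when drawing discrete samples:
float cdf[4] = {0.1f, 0.4f, 0.8f, 1.0f};
uint32_t idx = binary_search(0.5f, cdf, 4); // -> 2: cdf[2] = 0.8 is the first entry >= 0.5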
(weight.x) * (1 - weight.y) * read_val({xy.x+1, xy.y}) + - (1 - weight.x) * (weight.y) * read_val({xy.x, xy.y+1}) + - (weight.x) * (weight.y) * read_val({xy.x+1, xy.y+1}) + (1 - weight.x) * (1 - weight.y) * read_val({px.x, px.y}) + + (weight.x) * (1 - weight.y) * read_val({px.x+1, px.y}) + + (1 - weight.x) * (weight.y) * read_val({px.x, px.y+1}) + + (weight.x) * (weight.y) * read_val({px.x+1, px.y+1}) ); } @@ -404,12 +268,4 @@ struct Buffer2DView { } }; -uint8_t* load_stbi(const fs::path& path, int* width, int* height, int* comp, int req_comp); -float* load_stbi_float(const fs::path& path, int* width, int* height, int* comp, int req_comp); -uint16_t* load_stbi_16(const fs::path& path, int* width, int* height, int* comp, int req_comp); -bool is_hdr_stbi(const fs::path& path); -int write_stbi(const fs::path& path, int width, int height, int comp, const uint8_t* pixels, int quality = 100); - -FILE* native_fopen(const fs::path& path, const char* mode); - -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/common_device.cuh b/include/neural-graphics-primitives/common_device.cuh index d70d4fe83..dfcdbb791 100644 --- a/include/neural-graphics-primitives/common_device.cuh +++ b/include/neural-graphics-primitives/common_device.cuh @@ -19,11 +19,10 @@ #include #include -#include -NGP_NAMESPACE_BEGIN +#include -using precision_t = tcnn::network_precision_t; +namespace ngp { // The maximum depth that can be produced when rendering a frame. @@ -32,51 +31,11 @@ using precision_t = tcnn::network_precision_t; // even when rendering the infinitely distant horizon. inline constexpr __device__ float MAX_DEPTH() { return 16384.0f; } -template -class Buffer2D { -public: - Buffer2D() = default; - Buffer2D(const ivec2& resolution) { - resize(resolution); - } - - T* data() const { - return m_data.data(); - } - - size_t bytes() const { - return m_data.bytes(); - } - - void resize(const ivec2& resolution) { - m_data.resize(compMul(resolution)); - m_resolution = resolution; - } - - const ivec2& resolution() const { - return m_resolution; - } - - Buffer2DView view() const { - // Row major for now. - return {data(), m_resolution}; - } - - Buffer2DView const_view() const { - // Row major for now. 
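// A minimal usage sketch of Buffer2DView above: a non-owning view over a
// row-major buffer with nearest and bilinear lookups. Note that at_lerp
// places texel origins at integer coordinates, so uv = (0.25, 0.25) on a 2x2
// image blends all four texels equally. The buffer below is illustrative.
float pixels[4] = {0.0f, 1.0f, 2.0f, 3.0f}; // 2x2 image, row-major
Buffer2DView<float> view{pixels, ivec2(2, 2)};
float nearest = view.at(vec2(0.75f, 0.75f));      // texel (1,1) -> 3.0
float blended = view.at_lerp(vec2(0.25f, 0.25f)); // 0.25 * (0+1+2+3) = 1.5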
- return {data(), m_resolution}; - } - -private: - tcnn::GPUMemory m_data; - ivec2 m_resolution; -}; - inline NGP_HOST_DEVICE float srgb_to_linear(float srgb) { if (srgb <= 0.04045f) { return srgb / 12.92f; } else { - return std::pow((srgb + 0.055f) / 1.055f, 2.4f); + return pow((srgb + 0.055f) / 1.055f, 2.4f); } } @@ -88,7 +47,7 @@ inline NGP_HOST_DEVICE float srgb_to_linear_derivative(float srgb) { if (srgb <= 0.04045f) { return 1.0f / 12.92f; } else { - return 2.4f / 1.055f * std::pow((srgb + 0.055f) / 1.055f, 1.4f); + return 2.4f / 1.055f * pow((srgb + 0.055f) / 1.055f, 1.4f); } } @@ -100,7 +59,7 @@ inline NGP_HOST_DEVICE float linear_to_srgb(float linear) { if (linear < 0.0031308f) { return 12.92f * linear; } else { - return 1.055f * std::pow(linear, 0.41666f) - 0.055f; + return 1.055f * pow(linear, 0.41666f) - 0.055f; } } @@ -112,7 +71,7 @@ inline NGP_HOST_DEVICE float linear_to_srgb_derivative(float linear) { if (linear < 0.0031308f) { return 12.92f; } else { - return 1.055f * 0.41666f * std::pow(linear, 0.41666f - 1.0f); + return 1.055f * 0.41666f * pow(linear, 0.41666f - 1.0f); } } @@ -130,8 +89,8 @@ __device__ void deposit_image_gradient(const vec2& value, T* __restrict__ gradie constexpr uint32_t N_DIMS = 2; auto deposit_val = [&](const vec2& value, T weight, ivec2 pos) { - pos.x = std::max(std::min(pos.x, resolution.x-1), 0); - pos.y = std::max(std::min(pos.y, resolution.y-1), 0); + pos.x = max(min(pos.x, resolution.x-1), 0); + pos.y = max(min(pos.y, resolution.y-1), 0); #if TCNN_MIN_GPU_ARCH >= 60 // atomicAdd(__half2) is only supported with compute capability 60 and above if (std::is_same::value) { @@ -157,7 +116,7 @@ __device__ void deposit_image_gradient(const vec2& value, T* __restrict__ gradie struct FoveationPiecewiseQuadratic { FoveationPiecewiseQuadratic() = default; - FoveationPiecewiseQuadratic(float center_pixel_steepness, float center_inverse_piecewise_y, float center_radius) { + NGP_HOST_DEVICE FoveationPiecewiseQuadratic(float center_pixel_steepness, float center_inverse_piecewise_y, float center_radius) { float center_inverse_radius = center_radius * center_pixel_steepness; float left_inverse_piecewise_switch = center_inverse_piecewise_y - center_inverse_radius; float right_inverse_piecewise_switch = center_inverse_piecewise_y + center_inverse_radius; @@ -232,7 +191,7 @@ struct FoveationPiecewiseQuadratic { float inv_switch_left = 0.0f, inv_switch_right = 1.0f; NGP_HOST_DEVICE float warp(float x) const { - x = tcnn::clamp(x, 0.0f, 1.0f); + x = clamp(x, 0.0f, 1.0f); if (x < switch_left) { return al * x * x + bl * x + cl; } else if (x > switch_right) { @@ -243,18 +202,18 @@ struct FoveationPiecewiseQuadratic { } NGP_HOST_DEVICE float unwarp(float y) const { - y = tcnn::clamp(y, 0.0f, 1.0f); + y = clamp(y, 0.0f, 1.0f); if (y < inv_switch_left) { - return (std::sqrt(-4 * al * cl + 4 * al * y + bl * bl) - bl) / (2 * al); + return (sqrt(-4 * al * cl + 4 * al * y + bl * bl) - bl) / (2 * al); } else if (y > inv_switch_right) { - return (std::sqrt(-4 * ar * cr + 4 * ar * y + br * br) - br) / (2 * ar); + return (sqrt(-4 * ar * cr + 4 * ar * y + br * br) - br) / (2 * ar); } else { return (y - bm) / am; } } NGP_HOST_DEVICE float density(float x) const { - x = tcnn::clamp(x, 0.0f, 1.0f); + x = clamp(x, 0.0f, 1.0f); if (x < switch_left) { return 2 * al * x + bl; } else if (x > switch_right) { @@ -268,7 +227,7 @@ struct FoveationPiecewiseQuadratic { struct Foveation { Foveation() = default; - Foveation(const vec2& center_pixel_steepness, const vec2& center_inverse_piecewise_y, 
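// A quick numeric sanity check for the sRGB helpers above: the two transforms
// are near-inverses (the 0.41666 exponent approximates 1/2.4), and each
// *_derivative function closely matches a central finite difference. The
// sample point and step size are illustrative.
float x = 0.5f;
float roundtrip = srgb_to_linear(linear_to_srgb(x)); // ~0.5
float h = 1e-3f;
float fd = (srgb_to_linear(x + h) - srgb_to_linear(x - h)) / (2.0f * h);
float analytic = srgb_to_linear_derivative(x); // agrees closely with fd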
const vec2& center_radius) + NGP_HOST_DEVICE Foveation(const vec2& center_pixel_steepness, const vec2& center_inverse_piecewise_y, const vec2& center_radius) : warp_x{center_pixel_steepness.x, center_inverse_piecewise_y.x, center_radius.x}, warp_y{center_pixel_steepness.y, center_inverse_piecewise_y.y, center_radius.y} {} FoveationPiecewiseQuadratic warp_x, warp_y; @@ -309,10 +268,10 @@ NGP_HOST_DEVICE inline void opencv_fisheye_lens_distortion_delta(const T* extra_ const T k3 = extra_params[2]; const T k4 = extra_params[3]; - const T r = std::sqrt(u * u + v * v); + const T r = sqrt(u * u + v * v); - if (r > T(std::numeric_limits::epsilon())) { - const T theta = std::atan(r); + if (r > (T)std::numeric_limits::epsilon()) { + const T theta = atan(r); const T theta2 = theta * theta; const T theta4 = theta2 * theta2; const T theta6 = theta4 * theta2; @@ -337,8 +296,8 @@ NGP_HOST_DEVICE inline void iterative_lens_undistortion(const T* params, T* u, T const float kRelStepSize = 1e-6f; mat2 J; - const vec2 x0(*u, *v); - vec2 x(*u, *v); + const vec2 x0{*u, *v}; + vec2 x{*u, *v}; vec2 dx; vec2 dx_0b; vec2 dx_0f; @@ -346,8 +305,8 @@ NGP_HOST_DEVICE inline void iterative_lens_undistortion(const T* params, T* u, T vec2 dx_1f; for (uint32_t i = 0; i < kNumIterations; ++i) { - const float step0 = std::max(std::numeric_limits::epsilon(), std::abs(kRelStepSize * x[0])); - const float step1 = std::max(std::numeric_limits::epsilon(), std::abs(kRelStepSize * x[1])); + const float step0 = max(std::numeric_limits::epsilon(), abs(kRelStepSize * x[0])); + const float step1 = max(std::numeric_limits::epsilon(), abs(kRelStepSize * x[1])); distortion_fun(params, x[0], x[1], &dx[0], &dx[1]); distortion_fun(params, x[0] - step0, x[1], &dx_0b[0], &dx_0b[1]); distortion_fun(params, x[0] + step0, x[1], &dx_0f[0], &dx_0f[1]); @@ -402,7 +361,7 @@ inline NGP_HOST_DEVICE mat4x3 get_xform_given_rolling_shutter(const TrainingXFor float pixel_t = rolling_shutter.x + rolling_shutter.y * uv.x + rolling_shutter.z * uv.y + rolling_shutter.w * motionblur_time; vec3 pos = training_xform.start[3] + (training_xform.end[3] - training_xform.start[3]) * pixel_t; - mat3 rot = toMat3(normalize(slerp(fquat(mat3(training_xform.start)), fquat(mat3(training_xform.end)), pixel_t))); + mat3 rot = to_mat3(normalize(slerp(quat(mat3(training_xform.start)), quat(mat3(training_xform.end)), pixel_t))); return mat4x3(rot[0], rot[1], rot[2], pos); } @@ -433,7 +392,7 @@ inline NGP_HOST_DEVICE vec3 latlong_to_dir(const vec2& uv) { inline NGP_HOST_DEVICE vec3 equirectangular_to_dir(const vec2& uv) { float ct = (uv.y - 0.5f) * 2.0f; - float st = std::sqrt(std::max(1.0f - ct * ct, 0.0f)); + float st = sqrt(max(1.0f - ct * ct, 0.0f)); float phi = (uv.x - 0.5f) * PI() * 2.0f; float sp, cp; sincosf(phi, &sp, &cp); @@ -489,7 +448,7 @@ inline NGP_HOST_DEVICE Ray uv_to_ray( } if (distortion) { - dir.xy += distortion.at_lerp(warped_uv); + dir.xy() += distortion.at_lerp(warped_uv); } vec3 head_pos = {parallax_shift.x, parallax_shift.y, 0.f}; @@ -500,7 +459,7 @@ inline NGP_HOST_DEVICE Ray uv_to_ray( if (aperture_size != 0.0f) { vec3 lookat = origin + dir * focus_z; auto px = ivec2(uv * vec2(resolution)); - vec2 blur = aperture_size * square2disk_shirley(ld_random_val_2d(spp, px.x * 19349663 + px.y * 96925573) * 2.0f - vec2(1.0f)); + vec2 blur = aperture_size * square2disk_shirley(ld_random_val_2d(spp, px.x * 19349663 + px.y * 96925573) * 2.0f - 1.0f); origin += mat2x3(camera_matrix) * blur; dir = (lookat - origin) / focus_z; } @@ -576,7 +535,7 @@ inline 
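// A minimal 1D sketch of the scheme iterative_lens_undistortion implements
// above: Newton iterations on f(x) = x + distortion(x) - observed, with the
// Jacobian estimated by central finite differences. The quadratic distortion
// model and constants here are illustrative, not the OpenCV one.
#include <cmath>

inline float undistort_1d(float observed, float k = 0.1f, uint32_t n_iters = 100) {
	auto distortion = [k](float x) { return k * x * x; };
	float x = observed; // initial guess: the distorted coordinate itself
	for (uint32_t i = 0; i < n_iters; ++i) {
		float step = std::fmax(1e-6f, std::fabs(1e-6f * x));
		float residual = x + distortion(x) - observed;
		float dfdx = 1.0f + (distortion(x + step) - distortion(x - step)) / (2.0f * step);
		x -= residual / dfdx; // Newton update
	}
	return x; // x + distortion(x) ~= observed
}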
NGP_HOST_DEVICE vec2 pos_to_uv( dir.x += du; dir.y += dv; - vec2 uv = dir.xy * focal_length / vec2(resolution) + screen_center; + vec2 uv = dir.xy() * focal_length / vec2(resolution) + screen_center; return foveation.unwarp(uv); } @@ -648,11 +607,11 @@ inline NGP_HOST_DEVICE vec2 motion_vector( // and VR reprojection. inline NGP_HOST_DEVICE float to_ndc_depth(float z, float n, float f) { // View depth outside of the view frustum leads to output outside of [0, 1] - z = tcnn::clamp(z, n, f); + z = clamp(z, n, f); float scale = n / (n - f); float bias = -f * scale; - return tcnn::clamp((z * scale + bias) / z, 0.0f, 1.0f); + return clamp((z * scale + bias) / z, 0.0f, 1.0f); } inline NGP_HOST_DEVICE float fov_to_focal_length(int resolution, float degrees) { @@ -695,54 +654,13 @@ inline NGP_HOST_DEVICE vec2 to_vec2(const float2& x) { return {x.x, x.y}; } -inline NGP_HOST_DEVICE vec3 rotvec(const mat3& mat) { - quat tmp = mat; - return axis(tmp) * angle(tmp); +inline NGP_HOST_DEVICE mat4x3 camera_log_lerp(const mat4x3& a, const mat4x3& b, float t) { + return mat_exp(mat_log(mat4(b) * inverse(mat4(a))) * t) * mat4(a); } -inline NGP_HOST_DEVICE mat3 rotmat(float angle, const vec3& axis) { - float s, c; - sincosf(angle, &s, &c); - float oc = 1.0f - c; - - return mat3( - oc * axis.x * axis.x + c, oc * axis.x * axis.y + axis.z * s, oc * axis.z * axis.x - axis.y * s, - oc * axis.x * axis.y - axis.z * s, oc * axis.y * axis.y + c, oc * axis.y * axis.z + axis.x * s, - oc * axis.z * axis.x + axis.y * s, oc * axis.y * axis.z - axis.x * s, oc * axis.z * axis.z + c - ); -} - -inline NGP_HOST_DEVICE mat3 rotmat(const vec3& vec) { - float angle = length(vec); - if (angle == 0.0f) { - return mat3(1.0f); - } - - return rotmat(angle, vec / angle); -} - -inline NGP_HOST_DEVICE mat3 slerp(const mat3& a, const mat3& b, float t) { - return toMat3(slerp(quat(a), quat(b), t)); -} - -inline NGP_HOST_DEVICE float norm(const mat4x3& mat) { - return sqrt(length2(mat[0]) + length2(mat[1]) + length2(mat[2]) + length2(mat[3])); -} - -inline NGP_HOST_DEVICE bool isfinite(float v) { - return std::isfinite(v); -} - -inline NGP_HOST_DEVICE bvec2 isfinite(const vec2& v) { - return bvec2(std::isfinite(v.x), std::isfinite(v.y)); -} - -inline NGP_HOST_DEVICE bvec3 isfinite(const vec3& v) { - return bvec3(std::isfinite(v.x), std::isfinite(v.y), std::isfinite(v.z)); -} - -inline NGP_HOST_DEVICE bvec4 isfinite(const vec4& v) { - return bvec4(std::isfinite(v.x), std::isfinite(v.y), std::isfinite(v.z), std::isfinite(v.w)); +inline NGP_HOST_DEVICE mat4x3 camera_slerp(const mat4x3& a, const mat4x3& b, float t) { + mat3 rot = slerp(mat3(a), mat3(b), t); + return {rot[0], rot[1], rot[2], mix(a[3], b[3], t)}; } inline NGP_HOST_DEVICE void apply_quilting(uint32_t* x, uint32_t* y, const ivec2& resolution, vec3& parallax_shift, const ivec2& quilting_dims) { @@ -784,7 +702,7 @@ __global__ void from_rgba32(const uint64_t num_pixels, const uint8_t* __restrict alpha = 0.f; } - tcnn::vector_t rgba_out; + tvec rgba_out; rgba_out[0] = (T)(srgb_to_linear(rgba[0] * (1.0f/255.0f)) * alpha); rgba_out[1] = (T)(srgb_to_linear(rgba[1] * (1.0f/255.0f)) * alpha); rgba_out[2] = (T)(srgb_to_linear(rgba[2] * (1.0f/255.0f)) * alpha); @@ -794,10 +712,9 @@ __global__ void from_rgba32(const uint64_t num_pixels, const uint8_t* __restrict rgba_out[0] = rgba_out[1] = rgba_out[2] = rgba_out[3] = (T)-1.0f; } - *((tcnn::vector_t*)&out[i*4]) = rgba_out; + *((tvec*)&out[i*4]) = rgba_out; } - // Foley & van Dam p593 / http://en.wikipedia.org/wiki/HSL_and_HSV inline 
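// A minimal sketch of the two camera interpolators above, for two
// camera-to-world poses a and b (e.g. neighboring keyframes): camera_slerp
// blends rotation spherically and translation linearly, whereas
// camera_log_lerp interpolates the whole transform in matrix-log space, which
// also blends any scale consistently. Both reproduce the endpoints at t = 0, 1.
inline mat4x3 halfway_pose(const mat4x3& a, const mat4x3& b) {
	return camera_slerp(a, b, 0.5f);
}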
NGP_HOST_DEVICE vec3 hsv_to_rgb(const vec3& hsv) { float h = hsv.x, s = hsv.y, v = hsv.z; @@ -839,24 +756,24 @@ enum class EDepthDataType { }; inline NGP_HOST_DEVICE ivec2 image_pos(const vec2& pos, const ivec2& resolution) { - return clamp(ivec2(pos * vec2(resolution)), ivec2(0), resolution - ivec2(1)); + return clamp(ivec2(pos * vec2(resolution)), 0, resolution - 1); } -inline NGP_HOST_DEVICE uint64_t pixel_idx(const ivec2& pos, const ivec2& resolution, uint32_t img) { - return pos.x + pos.y * resolution.x + img * (uint64_t)resolution.x * resolution.y; +inline NGP_HOST_DEVICE uint64_t pixel_idx(const ivec2& px, const ivec2& resolution, uint32_t img) { + return px.x + px.y * resolution.x + img * (uint64_t)resolution.x * resolution.y; } -inline NGP_HOST_DEVICE uint64_t pixel_idx(const vec2& xy, const ivec2& resolution, uint32_t img) { - return pixel_idx(image_pos(xy, resolution), resolution, img); +inline NGP_HOST_DEVICE uint64_t pixel_idx(const vec2& uv, const ivec2& resolution, uint32_t img) { + return pixel_idx(image_pos(uv, resolution), resolution, img); } // inline NGP_HOST_DEVICE vec3 composit_and_lerp(vec2 pos, const ivec2& resolution, uint32_t img, const __half* training_images, const vec3& background_color, const vec3& exposure_scale = vec3(1.0f)) { -// pos = (pos.cwiseProduct(vec2(resolution)) - vec2(0.5f)).cwiseMax(0.0f).cwiseMin(vec2(resolution) - vec2(1.0f + 1e-4f)); +// pos = (pos.cwiseProduct(vec2(resolution)) - 0.5f).cwiseMax(0.0f).cwiseMin(vec2(resolution) - (1.0f + 1e-4f)); // const ivec2 pos_int = pos.cast(); // const vec2 weight = pos - pos_int.cast(); -// const ivec2 idx = pos_int.cwiseMin(resolution - ivec2(2)).cwiseMax(0); +// const ivec2 idx = pos_int.cwiseMin(resolution - 2).cwiseMax(0); // auto read_val = [&](const ivec2& p) { // __half val[4]; @@ -905,7 +822,7 @@ inline NGP_HOST_DEVICE vec4 read_rgba(ivec2 px, const ivec2& resolution, const v case EImageDataType::Half: { __half val[4]; *(uint64_t*)&val[0] = ((uint64_t*)pixels)[pixel_idx(px, resolution, img)]; - return vec4{val[0], val[1], val[2], val[3]}; + return vec4{(float)val[0], (float)val[1], (float)val[2], (float)val[3]}; } case EImageDataType::Float: return ((vec4*)pixels)[pixel_idx(px, resolution, img)]; @@ -924,10 +841,13 @@ inline NGP_HOST_DEVICE float read_depth(vec2 pos, const ivec2& resolution, const return read_val(image_pos(pos, resolution)); } -mat4x3 camera_log_lerp(const mat4x3& begin, const mat4x3& end, float t); -mat4x3 camera_slerp(const mat4x3& begin, const mat4x3& end, float t); +inline __device__ int float_to_ordered_int(float f) { + int i = __float_as_int(f); + return (i >= 0 ) ? i : i ^ 0x7FFFFFFF; +} -tcnn::GPUMemory load_exr_gpu(const fs::path& path, int* width, int* height); -tcnn::GPUMemory load_stbi_gpu(const fs::path& path, int* width, int* height); +inline __device__ float ordered_int_to_float(int i) { + return __int_as_float(i >= 0 ? i : i ^ 0x7FFFFFFF); +} -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/common_host.h b/include/neural-graphics-primitives/common_host.h new file mode 100644 index 000000000..8769a6beb --- /dev/null +++ b/include/neural-graphics-primitives/common_host.h @@ -0,0 +1,170 @@ +/* +* Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +* +* NVIDIA CORPORATION and its licensors retain all intellectual property +* and proprietary rights in and to this software, related documentation +* and any modifications thereto. 
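// A minimal sketch of what the ordered-int mapping above enables: CUDA has no
// atomicMin/atomicMax overloads for float, but since the mapping is monotonic,
// plain integer atomics implement a correct float min-reduction. The kernel
// and buffer names are illustrative.
__global__ void min_depth_kernel(const float* __restrict__ depths, uint32_t n, int* __restrict__ result) {
	uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n) {
		atomicMin(result, float_to_ordered_int(depths[i]));
	}
}
// Initialize *result to float_to_ordered_int(INFINITY) beforehand and decode
// the final value with ordered_int_to_float().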
Any use, reproduction, disclosure or +* distribution of this software and related documentation without an express +* license agreement from NVIDIA CORPORATION is strictly prohibited. +*/ + +/** @file common_host.h + * @author Thomas Müller, NVIDIA + * @brief Shared functionality among multiple neural-graphics-primitives components. + */ + +#pragma once + +#include + +#include + +#include + +#include + +#include +#include + +namespace ngp { + +namespace fs = filesystem; + +bool is_wsl(); + +fs::path discover_executable_dir(); +fs::path discover_root_dir(); + +#ifdef _WIN32 +std::string utf16_to_utf8(const std::wstring& utf16); +std::wstring utf8_to_utf16(const std::string& utf16); +std::wstring native_string(const fs::path& path); +#else +std::string native_string(const fs::path& path); +#endif + +bool ends_with(const std::string& str, const std::string& ending); +bool ends_with_case_insensitive(const std::string& str, const std::string& ending); + +ETestbedMode mode_from_scene(const std::string& scene); +ETestbedMode mode_from_string(const std::string& str); +std::string to_string(ETestbedMode); + +inline std::string replace_all(std::string str, const std::string& a, const std::string& b) { + std::string::size_type n = 0; + while ((n = str.find(a, n)) != std::string::npos) { + str.replace(n, a.length(), b); + n += b.length(); + } + return str; +} + +enum class EEmaType { + Time, + Step, +}; + +class Ema { +public: + Ema(EEmaType type, float half_life) + : m_type{type}, m_decay{std::pow(0.5f, 1.0f / half_life)}, m_creation_time{std::chrono::steady_clock::now()} {} + + int64_t current_progress() { + if (m_type == EEmaType::Time) { + auto now = std::chrono::steady_clock::now(); + return std::chrono::duration_cast(now - m_creation_time).count(); + } else { + return m_last_progress + 1; + } + } + + void update(float val) { + int64_t cur = current_progress(); + int64_t elapsed = cur - m_last_progress; + m_last_progress = cur; + + float decay = std::pow(m_decay, elapsed); + m_val = val; + m_ema_val = decay * m_ema_val + (1.0f - decay) * val; + } + + void set(float val) { + m_last_progress = current_progress(); + m_val = m_ema_val = val; + } + + float val() const { + return m_val; + } + + float ema_val() const { + return m_ema_val; + } + +private: + float m_val = 0.0f; + float m_ema_val = 0.0f; + EEmaType m_type; + float m_decay; + + int64_t m_last_progress = 0; + std::chrono::time_point m_creation_time; +}; + +uint8_t* load_stbi(const fs::path& path, int* width, int* height, int* comp, int req_comp); +float* load_stbi_float(const fs::path& path, int* width, int* height, int* comp, int req_comp); +uint16_t* load_stbi_16(const fs::path& path, int* width, int* height, int* comp, int req_comp); +bool is_hdr_stbi(const fs::path& path); +int write_stbi(const fs::path& path, int width, int height, int comp, const uint8_t* pixels, int quality = 100); + +FILE* native_fopen(const fs::path& path, const char* mode); + +GPUMemory load_exr_gpu(const fs::path& path, int* width, int* height); +GPUMemory load_stbi_gpu(const fs::path& path, int* width, int* height); + +template +class Buffer2D { +public: + Buffer2D() = default; + Buffer2D(const ivec2& resolution) { + resize(resolution); + } + + T* data() const { + return m_data.data(); + } + + size_t bytes() const { + return m_data.bytes(); + } + + void resize(const ivec2& resolution) { + m_data.resize(product(resolution)); + m_resolution = resolution; + } + + const ivec2& resolution() const { + return m_resolution; + } + + Buffer2DView view() const { + // Row 
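// A minimal usage sketch of the Ema helper above: with EEmaType::Step, each
// update advances progress by one and past values lose half their weight
// every `half_life` updates; EEmaType::Time decays per millisecond of wall
// clock instead. train_one_step() is a hypothetical stand-in.
Ema loss_ema{EEmaType::Step, 100.0f}; // half-life of 100 updates
for (int step = 0; step < 1000; ++step) {
	loss_ema.update(train_one_step());
}
float smoothed = loss_ema.ema_val(); // low-pass filtered loss; val() is the raw last value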
major for now. + return {data(), m_resolution}; + } + + Buffer2DView const_view() const { + // Row major for now. + return {data(), m_resolution}; + } + +private: + GPUMemory m_data; + ivec2 m_resolution; +}; + +struct BoundingBox; +struct Triangle; +std::ostream& operator<<(std::ostream& os, const BoundingBox& triangle); +std::ostream& operator<<(std::ostream& os, const Triangle& triangle); + +} diff --git a/include/neural-graphics-primitives/discrete_distribution.h b/include/neural-graphics-primitives/discrete_distribution.h index 32fda8b40..0e740ae53 100644 --- a/include/neural-graphics-primitives/discrete_distribution.h +++ b/include/neural-graphics-primitives/discrete_distribution.h @@ -14,7 +14,9 @@ #pragma once -NGP_NAMESPACE_BEGIN +#include + +namespace ngp { struct DiscreteDistribution { void build(std::vector weights) { @@ -43,4 +45,4 @@ struct DiscreteDistribution { std::vector cdf; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/dlss.h b/include/neural-graphics-primitives/dlss.h index 7d10fc23b..39c971b7f 100644 --- a/include/neural-graphics-primitives/dlss.h +++ b/include/neural-graphics-primitives/dlss.h @@ -18,7 +18,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { class IDlss { public: @@ -64,4 +64,4 @@ class IDlssProvider { std::shared_ptr init_vulkan_and_ngx(); #endif -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/envmap.cuh b/include/neural-graphics-primitives/envmap.cuh index daa5ad9ef..c2ae325f8 100644 --- a/include/neural-graphics-primitives/envmap.cuh +++ b/include/neural-graphics-primitives/envmap.cuh @@ -15,16 +15,11 @@ #pragma once -#include #include #include -#include -#include -#include - -NGP_NAMESPACE_BEGIN +namespace ngp { inline __device__ vec4 read_envmap(const Buffer2DView& envmap, const vec3& dir) { auto dir_cyl = dir_to_spherical_unorm({dir.z, -dir.x, dir.y}); @@ -40,7 +35,7 @@ inline __device__ vec4 read_envmap(const Buffer2DView& envmap, const } else if (pos.x >= envmap.resolution.x) { pos.x -= envmap.resolution.x; } - pos.y = std::max(std::min(pos.y, envmap.resolution.y-1), 0); + pos.y = max(min(pos.y, envmap.resolution.y-1), 0); return envmap.at(pos); }; @@ -55,7 +50,7 @@ inline __device__ vec4 read_envmap(const Buffer2DView& envmap, const } template -__device__ void deposit_envmap_gradient(const tcnn::vector_t& value, GRAD_T* __restrict__ envmap_gradient, const ivec2 envmap_resolution, const vec3& dir) { +__device__ void deposit_envmap_gradient(const tvec& value, GRAD_T* __restrict__ envmap_gradient, const ivec2 envmap_resolution, const vec3& dir) { auto dir_cyl = dir_to_spherical_unorm({dir.z, -dir.x, dir.y}); auto envmap_float = vec2{dir_cyl.y * (envmap_resolution.x-1), dir_cyl.x * (envmap_resolution.y-1)}; @@ -63,7 +58,7 @@ __device__ void deposit_envmap_gradient(const tcnn::vector_t& value, GRAD_ auto weight = envmap_float - vec2(envmap_texel); - auto deposit_val = [&](const tcnn::vector_t& value, T weight, ivec2 pos) { + auto deposit_val = [&](const tvec& value, T weight, ivec2 pos) { if (pos.x < 0) { pos.x += envmap_resolution.x; } else if (pos.x >= envmap_resolution.x) { @@ -71,8 +66,6 @@ __device__ void deposit_envmap_gradient(const tcnn::vector_t& value, GRAD_ } pos.y = std::max(std::min(pos.y, envmap_resolution.y-1), 0); - vec4 result; - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 // atomicAdd(__half2) is only supported with compute capability 60 and above if (std::is_same::value) { for (uint32_t c = 0; c < 4; c += 2) { @@ -93,4 +86,4 @@ __device__ void deposit_envmap_gradient(const 
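// A minimal sketch of drawing samples from the normalized CDF that
// DiscreteDistribution::build constructs above, reusing binary_search from
// common.h; assumes the cdf member is accessible as declared. A uniform
// u in [0,1) then selects index i with probability weights[i] / sum(weights).
DiscreteDistribution dist;
dist.build({1.0f, 3.0f, 6.0f}); // cdf becomes {0.1, 0.4, 1.0}
uint32_t i = binary_search(0.35f, dist.cdf.data(), (uint32_t)dist.cdf.size()); // -> 1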
tcnn::vector_t& value, GRAD_ deposit_val(value, (weight.x) * (weight.y), {envmap_texel.x+1, envmap_texel.y+1}); } -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/json_binding.h b/include/neural-graphics-primitives/json_binding.h index f55762327..01f55c24e 100644 --- a/include/neural-graphics-primitives/json_binding.h +++ b/include/neural-graphics-primitives/json_binding.h @@ -10,7 +10,7 @@ /** @file json_binding.h * @author Thomas Müller, NVIDIA - * @brief Conversion between eigen + * @brief Conversion between some ngp types and nlohmann::json. */ #pragma once @@ -18,136 +18,11 @@ #include #include -#include - -// Conversion between glm and json -namespace glm { - template - void to_json(nlohmann::json& j, const tmat3x3& mat) { - for (int row = 0; row < 3; ++row) { - nlohmann::json column = nlohmann::json::array(); - for (int col = 0; col < 3; ++col) { - column.push_back(mat[col][row]); - } - j.push_back(column); - } - } - - template - void from_json(const nlohmann::json& j, tmat3x3& mat) { - for (std::size_t row = 0; row < 3; ++row) { - const auto& jrow = j.at(row); - for (std::size_t col = 0; col < 3; ++col) { - const auto& value = jrow.at(col); - mat[col][row] = value.get(); - } - } - } - - template - void to_json(nlohmann::json& j, const tmat4x3& mat) { - for (int row = 0; row < 3; ++row) { - nlohmann::json column = nlohmann::json::array(); - for (int col = 0; col < 4; ++col) { - column.push_back(mat[col][row]); - } - j.push_back(column); - } - } - - template - void from_json(const nlohmann::json& j, tmat4x3& mat) { - for (std::size_t row = 0; row < 3; ++row) { - const auto& jrow = j.at(row); - for (std::size_t col = 0; col < 4; ++col) { - const auto& value = jrow.at(col); - mat[col][row] = value.get(); - } - } - } - - template - void to_json(nlohmann::json& j, const tmat4x4& mat) { - for (int row = 0; row < 4; ++row) { - nlohmann::json column = nlohmann::json::array(); - for (int col = 0; col < 4; ++col) { - column.push_back(mat[col][row]); - } - j.push_back(column); - } - } - - template - void from_json(const nlohmann::json& j, tmat4x4& mat) { - for (std::size_t row = 0; row < 4; ++row) { - const auto& jrow = j.at(row); - for (std::size_t col = 0; col < 4; ++col) { - const auto& value = jrow.at(col); - mat[col][row] = value.get(); - } - } - } +#include - template - void to_json(nlohmann::json& j, const tvec2& v) { - j.push_back(v.x); - j.push_back(v.y); - } - - template - void from_json(const nlohmann::json& j, tvec2& v) { - v.x = j.at(0).get(); - v.y = j.at(1).get(); - } - - template - void to_json(nlohmann::json& j, const tvec3& v) { - j.push_back(v.x); - j.push_back(v.y); - j.push_back(v.z); - } - - template - void from_json(const nlohmann::json& j, tvec3& v) { - v.x = j.at(0).get(); - v.y = j.at(1).get(); - v.z = j.at(2).get(); - } - - template - void to_json(nlohmann::json& j, const tvec4& v) { - j.push_back(v.x); - j.push_back(v.y); - j.push_back(v.z); - j.push_back(v.w); - } - - template - void from_json(const nlohmann::json& j, tvec4& v) { - v.x = j.at(0).get(); - v.y = j.at(1).get(); - v.z = j.at(2).get(); - v.w = j.at(3).get(); - } - - template - void to_json(nlohmann::json& j, const tquat& q) { - j.push_back(q.x); - j.push_back(q.y); - j.push_back(q.z); - j.push_back(q.w); - } - - template - void from_json(const nlohmann::json& j, tquat& q) { - q.x = j.at(0).get(); - q.y = j.at(1).get(); - q.z = j.at(2).get(); - q.w = j.at(3).get(); - } -} +#include -NGP_NAMESPACE_BEGIN +namespace ngp { inline void to_json(nlohmann::json& j, const BoundingBox& box) { 
j["min"] = box.min; @@ -287,7 +162,7 @@ inline void from_json(const nlohmann::json& j, NerfDataset& dataset) { } dataset.render_aabb = j.at("render_aabb"); - dataset.render_aabb_to_local = mat3(1.0f); + dataset.render_aabb_to_local = mat3::identity(); if (j.contains("render_aabb_to_local")) dataset.render_aabb_to_local = j.at("render_aabb_to_local"); dataset.up = j.at("up"); @@ -307,4 +182,4 @@ inline void from_json(const nlohmann::json& j, NerfDataset& dataset) { dataset.n_extra_learnable_dims = j.value("n_extra_learnable_dims", 0); } -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/marching_cubes.h b/include/neural-graphics-primitives/marching_cubes.h index 5d7ba5092..869b4f2a1 100644 --- a/include/neural-graphics-primitives/marching_cubes.h +++ b/include/neural-graphics-primitives/marching_cubes.h @@ -15,37 +15,38 @@ #pragma once #include +#include #include -NGP_NAMESPACE_BEGIN +namespace ngp { ivec3 get_marching_cubes_res(uint32_t res_1d, const BoundingBox& render_aabb); -void marching_cubes_gpu(cudaStream_t stream, BoundingBox render_aabb, mat3 render_aabb_to_local, ivec3 res_3d, float thresh, const tcnn::GPUMemory& density, tcnn::GPUMemory& vert_out, tcnn::GPUMemory& indices_out); +void marching_cubes_gpu(cudaStream_t stream, BoundingBox render_aabb, mat3 render_aabb_to_local, ivec3 res_3d, float thresh, const GPUMemory& density, GPUMemory& vert_out, GPUMemory& indices_out); // computes the average of the 1ring of all verts, as homogenous coordinates -void compute_mesh_1ring(const tcnn::GPUMemory& verts, const tcnn::GPUMemory& indices, tcnn::GPUMemory& output_pos, tcnn::GPUMemory& output_normals); +void compute_mesh_1ring(const GPUMemory& verts, const GPUMemory& indices, GPUMemory& output_pos, GPUMemory& output_normals); void compute_mesh_opt_gradients( float thresh, - const tcnn::GPUMemory& verts, - const tcnn::GPUMemory& vert_normals, - const tcnn::GPUMemory& verts_smoothed, - const tcnn::network_precision_t* densities, + const GPUMemory& verts, + const GPUMemory& vert_normals, + const GPUMemory& verts_smoothed, + const network_precision_t* densities, uint32_t input_gradient_width, const float* input_gradients, - tcnn::GPUMemory& verts_gradient_out, + GPUMemory& verts_gradient_out, float k_smooth_amount, float k_density_amount, float k_inflate_amount ); void save_mesh( - tcnn::GPUMemory& verts, - tcnn::GPUMemory& normals, - tcnn::GPUMemory& colors, - tcnn::GPUMemory& indices, + GPUMemory& verts, + GPUMemory& normals, + GPUMemory& colors, + GPUMemory& indices, const fs::path& path, bool unwrap_it, float nerf_scale, @@ -54,10 +55,10 @@ void save_mesh( #ifdef NGP_GUI void draw_mesh_gl( - const tcnn::GPUMemory& verts, - const tcnn::GPUMemory& normals, - const tcnn::GPUMemory& cols, - const tcnn::GPUMemory& indices, + const GPUMemory& verts, + const GPUMemory& normals, + const GPUMemory& cols, + const GPUMemory& indices, const ivec2& resolution, const vec2& focal_length, const mat4x3& camera_matrix, @@ -70,8 +71,8 @@ uint32_t compile_shader(bool pixel, const char* code); bool check_shader(uint32_t handle, const char* desc, bool program); #endif -void save_density_grid_to_png(const tcnn::GPUMemory& density, const fs::path& path, ivec3 res3d, float thresh, bool swap_y_z = true, float density_range = 4.f); -void save_rgba_grid_to_png_sequence(const tcnn::GPUMemory& rgba, const fs::path& path, ivec3 res3d, bool swap_y_z = true); -void save_rgba_grid_to_raw_file(const tcnn::GPUMemory& rgba, const fs::path& path, ivec3 res3d, bool swap_y_z, int cascade); +void 
save_density_grid_to_png(const GPUMemory& density, const fs::path& path, ivec3 res3d, float thresh, bool swap_y_z = true, float density_range = 4.f); +void save_rgba_grid_to_png_sequence(const GPUMemory& rgba, const fs::path& path, ivec3 res3d, bool swap_y_z = true); +void save_rgba_grid_to_raw_file(const GPUMemory& rgba, const fs::path& path, ivec3 res3d, bool swap_y_z, int cascade); -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/nerf.h b/include/neural-graphics-primitives/nerf.h index 26f826a3a..c8d5d3780 100644 --- a/include/neural-graphics-primitives/nerf.h +++ b/include/neural-graphics-primitives/nerf.h @@ -15,32 +15,12 @@ #pragma once #include +#include -#include - -NGP_NAMESPACE_BEGIN - -// size of the density/occupancy grid in number of cells along an axis. -inline constexpr __device__ uint32_t NERF_GRIDSIZE() { - return 128; -} - -inline constexpr __device__ uint32_t NERF_GRID_N_CELLS() { - return NERF_GRIDSIZE() * NERF_GRIDSIZE() * NERF_GRIDSIZE(); -} - -struct NerfPayload { - vec3 origin; - vec3 dir; - float t; - float max_weight; - uint32_t idx; - uint16_t n_steps; - bool alive; -}; +namespace ngp { struct RaysNerfSoa { -#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) +#if defined(__CUDACC__) || (defined(__clang__) && defined(__CUDA__)) void copy_from_other_async(const RaysNerfSoa& other, cudaStream_t stream) { CUDA_CHECK_THROW(cudaMemcpyAsync(rgba, other.rgba, size * sizeof(vec4), cudaMemcpyDeviceToDevice, stream)); CUDA_CHECK_THROW(cudaMemcpyAsync(depth, other.depth, size * sizeof(float), cudaMemcpyDeviceToDevice, stream)); @@ -61,53 +41,4 @@ struct RaysNerfSoa { size_t size; }; -//#define TRIPLANAR_COMPATIBLE_POSITIONS // if this is defined, then positions are stored as [x,y,z,x] so that it can be split as [x,y] [y,z] [z,x] by the input encoding - -struct NerfPosition { - NGP_HOST_DEVICE NerfPosition(const vec3& pos, float dt) - : - p{pos} -#ifdef TRIPLANAR_COMPATIBLE_POSITIONS - , x{pos.x} -#endif - {} - vec3 p; -#ifdef TRIPLANAR_COMPATIBLE_POSITIONS - float x; -#endif -}; - -struct NerfDirection { - NGP_HOST_DEVICE NerfDirection(const vec3& dir, float dt) : d{dir} {} - vec3 d; -}; - -struct NerfCoordinate { - NGP_HOST_DEVICE NerfCoordinate(const vec3& pos, const vec3& dir, float dt) : pos{pos, dt}, dt{dt}, dir{dir, dt} {} - NGP_HOST_DEVICE void set_with_optional_extra_dims(const vec3& pos, const vec3& dir, float dt, const float* extra_dims, uint32_t stride_in_bytes) { - this->dt = dt; - this->pos = NerfPosition(pos, dt); - this->dir = NerfDirection(dir, dt); - copy_extra_dims(extra_dims, stride_in_bytes); - } - inline NGP_HOST_DEVICE const float* get_extra_dims() const { return (const float*)(this + 1); } - inline NGP_HOST_DEVICE float* get_extra_dims() { return (float*)(this + 1); } - - NGP_HOST_DEVICE void copy(const NerfCoordinate& inp, uint32_t stride_in_bytes) { - *this = inp; - copy_extra_dims(inp.get_extra_dims(), stride_in_bytes); - } - NGP_HOST_DEVICE inline void copy_extra_dims(const float *extra_dims, uint32_t stride_in_bytes) { - if (stride_in_bytes >= sizeof(NerfCoordinate)) { - float* dst = get_extra_dims(); - const uint32_t n_extra = (stride_in_bytes - sizeof(NerfCoordinate)) / sizeof(float); - for (uint32_t i = 0; i < n_extra; ++i) dst[i] = extra_dims[i]; - } - } - - NerfPosition pos; - float dt; - NerfDirection dir; -}; - -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/nerf_device.cuh b/include/neural-graphics-primitives/nerf_device.cuh new file mode 100644 index 000000000..61cce7479 --- 
/dev/null +++ b/include/neural-graphics-primitives/nerf_device.cuh @@ -0,0 +1,617 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved. + * + * NVIDIA CORPORATION and its licensors retain all intellectual property + * and proprietary rights in and to this software, related documentation + * and any modifications thereto. Any use, reproduction, disclosure or + * distribution of this software and related documentation without an express + * license agreement from NVIDIA CORPORATION is strictly prohibited. + */ + +/** @file nerf_device.cuh + * @author Thomas Müller & Alex Evans, NVIDIA + */ + +#pragma once + +#include + +#include + +namespace ngp { + +// size of the density/occupancy grid in number of cells along an axis. +inline constexpr __device__ uint32_t NERF_GRIDSIZE() { return 128; } +inline constexpr __device__ uint32_t NERF_GRID_N_CELLS() { return NERF_GRIDSIZE() * NERF_GRIDSIZE() * NERF_GRIDSIZE(); } + +inline constexpr __device__ float NERF_RENDERING_NEAR_DISTANCE() { return 0.05f; } +inline constexpr __device__ uint32_t NERF_STEPS() { return 1024; } // finest number of steps per unit length +inline constexpr __device__ uint32_t NERF_CASCADES() { return 8; } + +inline constexpr __device__ float SQRT3() { return 1.73205080757f; } +inline constexpr __device__ float STEPSIZE() { return (SQRT3() / NERF_STEPS()); } // for nerf raymarch +inline constexpr __device__ float MIN_CONE_STEPSIZE() { return STEPSIZE(); } +// Maximum step size is the width of the coarsest gridsize cell. +inline constexpr __device__ float MAX_CONE_STEPSIZE() { return STEPSIZE() * (1<<(NERF_CASCADES()-1)) * NERF_STEPS() / NERF_GRIDSIZE(); } + +// Used to index into the PRNG stream. Must be larger than the number of +// samples consumed by any given training ray. +inline constexpr __device__ uint32_t N_MAX_RANDOM_SAMPLES_PER_RAY() { return 16; } + +// Any alpha below this is considered "invisible" and is thus culled away. +inline constexpr __device__ float NERF_MIN_OPTICAL_THICKNESS() { return 0.01f; } + +struct TrainingImageMetadata { + // Camera intrinsics and additional data associated with a NeRF training image + // the memory to back the pixels and rays is held by GPUMemory objects in the NerfDataset and copied here. + const void* pixels = nullptr; + EImageDataType image_data_type = EImageDataType::Half; + + const float* depth = nullptr; + const Ray* rays = nullptr; + + Lens lens = {}; + ivec2 resolution = ivec2(0); + vec2 principal_point = vec2(0.5f); + vec2 focal_length = vec2(1000.f); + vec4 rolling_shutter = vec4(0.0f); + vec3 light_dir = vec3(0.f); // TODO: replace this with more generic float[] of task-specific metadata. 
+}; + +struct LossAndGradient { + vec3 loss; + vec3 gradient; + + NGP_HOST_DEVICE LossAndGradient operator*(float scalar) { + return {loss * scalar, gradient * scalar}; + } + + NGP_HOST_DEVICE LossAndGradient operator/(float scalar) { + return {loss / scalar, gradient / scalar}; + } +}; + +inline NGP_HOST_DEVICE LossAndGradient l2_loss(const vec3& target, const vec3& prediction) { + vec3 difference = prediction - target; + return { + difference * difference, + 2.0f * difference + }; +} + +inline NGP_HOST_DEVICE LossAndGradient relative_l2_loss(const vec3& target, const vec3& prediction) { + vec3 difference = prediction - target; + vec3 denom = prediction * prediction + 1e-2f; + return { + difference * difference / denom, + 2.0f * difference / denom + }; +} + +inline NGP_HOST_DEVICE LossAndGradient l1_loss(const vec3& target, const vec3& prediction) { + vec3 difference = prediction - target; + return { + abs(difference), + copysign(vec3(1.0f), difference), + }; +} + +inline NGP_HOST_DEVICE LossAndGradient huber_loss(const vec3& target, const vec3& prediction, float alpha = 1) { + vec3 difference = prediction - target; + vec3 abs_diff = abs(difference); + vec3 square = 0.5f/alpha * difference * difference; + return { + { + abs_diff.x > alpha ? (abs_diff.x - 0.5f * alpha) : square.x, + abs_diff.y > alpha ? (abs_diff.y - 0.5f * alpha) : square.y, + abs_diff.z > alpha ? (abs_diff.z - 0.5f * alpha) : square.z, + }, + { + abs_diff.x > alpha ? (difference.x > 0 ? 1.0f : -1.0f) : (difference.x / alpha), + abs_diff.y > alpha ? (difference.y > 0 ? 1.0f : -1.0f) : (difference.y / alpha), + abs_diff.z > alpha ? (difference.z > 0 ? 1.0f : -1.0f) : (difference.z / alpha), + }, + }; +} + +inline NGP_HOST_DEVICE LossAndGradient log_l1_loss(const vec3& target, const vec3& prediction) { + vec3 difference = prediction - target; + vec3 divisor = abs(difference) + 1.0f; + return { + log(divisor), + copysign(vec3(1.0f) / divisor, difference), + }; +} + +inline NGP_HOST_DEVICE LossAndGradient smape_loss(const vec3& target, const vec3& prediction) { + vec3 difference = prediction - target; + vec3 denom = 0.5f * (abs(prediction) + abs(target)) + 1e-2f; + return { + abs(difference) / denom, + copysign(vec3(1.0f) / denom, difference), + }; +} + +inline NGP_HOST_DEVICE LossAndGradient mape_loss(const vec3& target, const vec3& prediction) { + vec3 difference = prediction - target; + vec3 denom = abs(prediction) + 1e-2f; + return { + abs(difference) / denom, + copysign(vec3(1.0f) / denom, difference), + }; +} + +struct NerfPayload { + vec3 origin; + vec3 dir; + float t; + float max_weight; + uint32_t idx; + uint16_t n_steps; + bool alive; +}; + +//#define TRIPLANAR_COMPATIBLE_POSITIONS // if this is defined, then positions are stored as [x,y,z,x] so that it can be split as [x,y] [y,z] [z,x] by the input encoding + +struct NerfPosition { + NGP_HOST_DEVICE NerfPosition(const vec3& pos, float dt) + : + p{pos} +#ifdef TRIPLANAR_COMPATIBLE_POSITIONS + , x{pos.x} +#endif + {} + vec3 p; +#ifdef TRIPLANAR_COMPATIBLE_POSITIONS + float x; +#endif +}; + +struct NerfDirection { + NGP_HOST_DEVICE NerfDirection(const vec3& dir, float dt) : d{dir} {} + vec3 d; +}; + +struct NerfCoordinate { + NGP_HOST_DEVICE NerfCoordinate(const vec3& pos, const vec3& dir, float dt) : pos{pos, dt}, dt{dt}, dir{dir, dt} {} + NGP_HOST_DEVICE void set_with_optional_extra_dims(const vec3& pos, const vec3& dir, float dt, const float* extra_dims, uint32_t stride_in_bytes) { + this->dt = dt; + this->pos = NerfPosition(pos, dt); + this->dir = 
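// A minimal scalar sketch of huber_loss above: quadratic within alpha of the
// target and linear outside, with value and slope agreeing at the transition,
// so the loss is C^1 there (both branches give 0.5 * alpha, with slope 1, at
// |d| == alpha).
#include <cmath>

inline float huber_1d(float d, float alpha = 1.0f) {
	float a = std::fabs(d);
	return a > alpha ? (a - 0.5f * alpha)      // linear tail: bounded gradient
	                 : (0.5f / alpha * d * d); // quadratic bowl near the target
}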
NerfDirection(dir, dt); + copy_extra_dims(extra_dims, stride_in_bytes); + } + inline NGP_HOST_DEVICE const float* get_extra_dims() const { return (const float*)(this + 1); } + inline NGP_HOST_DEVICE float* get_extra_dims() { return (float*)(this + 1); } + + NGP_HOST_DEVICE void copy(const NerfCoordinate& inp, uint32_t stride_in_bytes) { + *this = inp; + copy_extra_dims(inp.get_extra_dims(), stride_in_bytes); + } + NGP_HOST_DEVICE inline void copy_extra_dims(const float *extra_dims, uint32_t stride_in_bytes) { + if (stride_in_bytes >= sizeof(NerfCoordinate)) { + float* dst = get_extra_dims(); + const uint32_t n_extra = (stride_in_bytes - sizeof(NerfCoordinate)) / sizeof(float); + for (uint32_t i = 0; i < n_extra; ++i) dst[i] = extra_dims[i]; + } + } + + NerfPosition pos; + float dt; + NerfDirection dir; +}; + +inline NGP_HOST_DEVICE float network_to_rgb(float val, ENerfActivation activation) { + switch (activation) { + case ENerfActivation::None: return val; + case ENerfActivation::ReLU: return val > 0.0f ? val : 0.0f; + case ENerfActivation::Logistic: return logistic(val); + case ENerfActivation::Exponential: return expf(clamp(val, -10.0f, 10.0f)); + default: assert(false); + } + return 0.0f; +} + +inline NGP_HOST_DEVICE float network_to_rgb_derivative(float val, ENerfActivation activation) { + switch (activation) { + case ENerfActivation::None: return 1.0f; + case ENerfActivation::ReLU: return val > 0.0f ? 1.0f : 0.0f; + case ENerfActivation::Logistic: { float density = logistic(val); return density * (1 - density); }; + case ENerfActivation::Exponential: return expf(clamp(val, -10.0f, 10.0f)); + default: assert(false); + } + return 0.0f; +} + +template +NGP_HOST_DEVICE vec3 network_to_rgb_derivative_vec(const T& val, ENerfActivation activation) { + return { + network_to_rgb_derivative(float(val[0]), activation), + network_to_rgb_derivative(float(val[1]), activation), + network_to_rgb_derivative(float(val[2]), activation), + }; +} + +inline NGP_HOST_DEVICE float network_to_density(float val, ENerfActivation activation) { + switch (activation) { + case ENerfActivation::None: return val; + case ENerfActivation::ReLU: return val > 0.0f ? val : 0.0f; + case ENerfActivation::Logistic: return logistic(val); + case ENerfActivation::Exponential: return expf(val); + default: assert(false); + } + return 0.0f; +} + +inline NGP_HOST_DEVICE float network_to_density_derivative(float val, ENerfActivation activation) { + switch (activation) { + case ENerfActivation::None: return 1.0f; + case ENerfActivation::ReLU: return val > 0.0f ? 
1.0f : 0.0f; + case ENerfActivation::Logistic: { float density = logistic(val); return density * (1 - density); }; + case ENerfActivation::Exponential: return expf(clamp(val, -15.0f, 15.0f)); + default: assert(false); + } + return 0.0f; +} + +template +NGP_HOST_DEVICE vec3 network_to_rgb_vec(const T& val, ENerfActivation activation) { + return { + network_to_rgb(float(val[0]), activation), + network_to_rgb(float(val[1]), activation), + network_to_rgb(float(val[2]), activation), + }; +} + +inline NGP_HOST_DEVICE vec3 warp_position(const vec3& pos, const BoundingBox& aabb) { + // return {logistic(pos.x - 0.5f), logistic(pos.y - 0.5f), logistic(pos.z - 0.5f)}; + // return pos; + + return aabb.relative_pos(pos); +} + +inline NGP_HOST_DEVICE vec3 unwarp_position(const vec3& pos, const BoundingBox& aabb) { + // return {logit(pos.x) + 0.5f, logit(pos.y) + 0.5f, logit(pos.z) + 0.5f}; + // return pos; + + return aabb.min + pos * aabb.diag(); +} + +inline NGP_HOST_DEVICE vec3 unwarp_position_derivative(const vec3& pos, const BoundingBox& aabb) { + // return {logit(pos.x) + 0.5f, logit(pos.y) + 0.5f, logit(pos.z) + 0.5f}; + // return pos; + + return aabb.diag(); +} + +inline NGP_HOST_DEVICE vec3 warp_position_derivative(const vec3& pos, const BoundingBox& aabb) { + return vec3(1.0f) / unwarp_position_derivative(pos, aabb); +} + +inline NGP_HOST_DEVICE vec3 warp_direction(const vec3& dir) { + return (dir + 1.0f) * 0.5f; +} + +inline NGP_HOST_DEVICE vec3 unwarp_direction(const vec3& dir) { + return dir * 2.0f - 1.0f; +} + +inline NGP_HOST_DEVICE vec3 warp_direction_derivative(const vec3& dir) { + return vec3(0.5f); +} + +inline NGP_HOST_DEVICE vec3 unwarp_direction_derivative(const vec3& dir) { + return vec3(2.0f); +} + +inline NGP_HOST_DEVICE float warp_dt(float dt) { + float max_stepsize = MIN_CONE_STEPSIZE() * (1<<(NERF_CASCADES()-1)); + return (dt - MIN_CONE_STEPSIZE()) / (max_stepsize - MIN_CONE_STEPSIZE()); +} + +inline NGP_HOST_DEVICE float unwarp_dt(float dt) { + float max_stepsize = MIN_CONE_STEPSIZE() * (1<<(NERF_CASCADES()-1)); + return dt * (max_stepsize - MIN_CONE_STEPSIZE()) + MIN_CONE_STEPSIZE(); +} + +inline NGP_HOST_DEVICE uint32_t cascaded_grid_idx_at(vec3 pos, uint32_t mip) { + float mip_scale = scalbnf(1.0f, -mip); + pos -= vec3(0.5f); + pos *= mip_scale; + pos += vec3(0.5f); + + ivec3 i = pos * (float)NERF_GRIDSIZE(); + if (i.x < 0 || i.x >= NERF_GRIDSIZE() || i.y < 0 || i.y >= NERF_GRIDSIZE() || i.z < 0 || i.z >= NERF_GRIDSIZE()) { + return 0xFFFFFFFF; + } + + return morton3D(i.x, i.y, i.z); +} + +inline NGP_HOST_DEVICE uint32_t grid_mip_offset(uint32_t mip) { + return NERF_GRID_N_CELLS() * mip; +} + +inline NGP_HOST_DEVICE bool density_grid_occupied_at(const vec3& pos, const uint8_t* density_grid_bitfield, uint32_t mip) { + uint32_t idx = cascaded_grid_idx_at(pos, mip); + if (idx == 0xFFFFFFFF) { + return false; + } + return density_grid_bitfield[idx/8+grid_mip_offset(mip)/8] & (1<<(idx%8)); +} + +inline NGP_HOST_DEVICE float cascaded_grid_at(vec3 pos, const float* cascaded_grid, uint32_t mip) { + uint32_t idx = cascaded_grid_idx_at(pos, mip); + if (idx == 0xFFFFFFFF) { + return 0.0f; + } + return cascaded_grid[idx+grid_mip_offset(mip)]; +} + +inline NGP_HOST_DEVICE float& cascaded_grid_at(vec3 pos, float* cascaded_grid, uint32_t mip) { + uint32_t idx = cascaded_grid_idx_at(pos, mip); + if (idx == 0xFFFFFFFF) { + idx = 0; + printf("WARNING: invalid cascaded grid access."); + } + return cascaded_grid[idx+grid_mip_offset(mip)]; +} + +inline NGP_HOST_DEVICE float 
distance_to_next_voxel(const vec3& pos, const vec3& dir, const vec3& idir, float res) { // DDA-like step
+	vec3 p = res * (pos - 0.5f);
+	float tx = (floorf(p.x + 0.5f + 0.5f * sign(dir.x)) - p.x) * idir.x;
+	float ty = (floorf(p.y + 0.5f + 0.5f * sign(dir.y)) - p.y) * idir.y;
+	float tz = (floorf(p.z + 0.5f + 0.5f * sign(dir.z)) - p.z) * idir.z;
+	float t = min(min(tx, ty), tz);
+
+	return fmaxf(t / res, 0.0f);
+}
+
+inline NGP_HOST_DEVICE float calc_cone_angle(float cosine, const vec2& focal_length, float cone_angle_constant) {
+	// Pixel size. Doesn't always yield a good performance vs. quality
+	// trade-off. Especially if training pixels have a much different
+	// size than rendering pixels.
+	// return cosine*cosine / focal_length.mean();
+
+	return cone_angle_constant;
+}
+
+inline NGP_HOST_DEVICE float to_stepping_space(float t, float cone_angle) {
+	if (cone_angle <= 1e-5f) {
+		return t / MIN_CONE_STEPSIZE();
+	}
+
+	float log1p_c = logf(1.0f + cone_angle);
+
+	float a = (logf(MIN_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c;
+	float b = (logf(MAX_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c;
+
+	float at = expf(a * log1p_c);
+	float bt = expf(b * log1p_c);
+
+	if (t <= at) {
+		return (t - at) / MIN_CONE_STEPSIZE() + a;
+	} else if (t <= bt) {
+		return logf(t) / log1p_c;
+	} else {
+		return (t - bt) / MAX_CONE_STEPSIZE() + b;
+	}
+}
+
+inline NGP_HOST_DEVICE float from_stepping_space(float n, float cone_angle) {
+	if (cone_angle <= 1e-5f) {
+		return n * MIN_CONE_STEPSIZE();
+	}
+
+	float log1p_c = logf(1.0f + cone_angle);
+
+	float a = (logf(MIN_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c;
+	float b = (logf(MAX_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c;
+
+	float at = expf(a * log1p_c);
+	float bt = expf(b * log1p_c);
+
+	if (n <= a) {
+		return (n - a) * MIN_CONE_STEPSIZE() + at;
+	} else if (n <= b) {
+		return expf(n * log1p_c);
+	} else {
+		return (n - b) * MAX_CONE_STEPSIZE() + bt;
+	}
+}
+
+inline NGP_HOST_DEVICE float advance_n_steps(float t, float cone_angle, float n) {
+	return from_stepping_space(to_stepping_space(t, cone_angle) + n, cone_angle);
+}
+
+inline NGP_HOST_DEVICE float calc_dt(float t, float cone_angle) {
+	return advance_n_steps(t, cone_angle, 1.0f) - t;
+}
+
+inline NGP_HOST_DEVICE float advance_to_next_voxel(float t, float cone_angle, const vec3& pos, const vec3& dir, const vec3& idir, uint32_t mip) {
+	float res = scalbnf(NERF_GRIDSIZE(), -(int)mip);
+
+	float t_target = t + distance_to_next_voxel(pos, dir, idir, res);
+
+	// Analytic stepping in multiples of 1 in the "log-space" of our exponential stepping routine
+	t = to_stepping_space(t, cone_angle);
+	t_target = to_stepping_space(t_target, cone_angle);
+
+	return from_stepping_space(t + ceilf(fmaxf(t_target - t, 0.5f)), cone_angle);
+}
+
+inline NGP_HOST_DEVICE uint32_t mip_from_pos(const vec3& pos, uint32_t max_cascade = NERF_CASCADES()-1) {
+	int exponent;
+	float maxval = max(abs(pos - 0.5f));
+	frexpf(maxval, &exponent);
+	return (uint32_t)clamp(exponent+1, 0, (int)max_cascade);
+}
+
+inline NGP_HOST_DEVICE uint32_t mip_from_dt(float dt, const vec3& pos, uint32_t max_cascade = NERF_CASCADES()-1) {
+	uint32_t mip = mip_from_pos(pos, max_cascade);
+	dt *= 2 * NERF_GRIDSIZE();
+	if (dt < 1.0f) {
+		return mip;
+	}
+
+	int exponent;
+	frexpf(dt, &exponent);
+	return (uint32_t)clamp((int)mip, exponent, (int)max_cascade);
+}
+
+template 
+NGP_HOST_DEVICE float if_unoccupied_advance_to_next_occupied_voxel(
+	float t,
+	float cone_angle,
+	const Ray& ray,
+	const vec3& idir,
+	const uint8_t* __restrict__ 
density_grid, + uint32_t min_mip, + uint32_t max_mip, + BoundingBox aabb, + mat3 aabb_to_local = mat3::identity() +) { + while (true) { + vec3 pos = ray(t); + if (t >= MAX_DEPTH() || !aabb.contains(aabb_to_local * pos)) { + return MAX_DEPTH(); + } + + uint32_t mip = clamp(MIP_FROM_DT ? mip_from_dt(calc_dt(t, cone_angle), pos) : mip_from_pos(pos), min_mip, max_mip); + + if (!density_grid || density_grid_occupied_at(pos, density_grid, mip)) { + return t; + } + + // Find largest empty voxel surrounding us, such that we can advance as far as possible in the next step. + // Other places that do voxel stepping don't need this, because they don't rely on thread coherence as + // much as this one here. + while (mip < max_mip && !density_grid_occupied_at(pos, density_grid, mip+1)) { + ++mip; + } + + t = advance_to_next_voxel(t, cone_angle, pos, ray.d, idir, mip); + } +} + +static constexpr float UNIFORM_SAMPLING_FRACTION = 0.5f; + +inline NGP_HOST_DEVICE vec2 sample_cdf_2d(vec2 sample, uint32_t img, const ivec2& res, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y, float* __restrict__ pdf) { + if (sample.x < UNIFORM_SAMPLING_FRACTION) { + sample.x /= UNIFORM_SAMPLING_FRACTION; + return sample; + } + + sample.x = (sample.x - UNIFORM_SAMPLING_FRACTION) / (1.0f - UNIFORM_SAMPLING_FRACTION); + + cdf_y += img * res.y; + + // First select row according to cdf_y + uint32_t y = binary_search(sample.y, cdf_y, res.y); + float prev = y > 0 ? cdf_y[y-1] : 0.0f; + float pmf_y = cdf_y[y] - prev; + sample.y = (sample.y - prev) / pmf_y; + + cdf_x_cond_y += img * res.y * res.x + y * res.x; + + // Then, select col according to x + uint32_t x = binary_search(sample.x, cdf_x_cond_y, res.x); + prev = x > 0 ? cdf_x_cond_y[x-1] : 0.0f; + float pmf_x = cdf_x_cond_y[x] - prev; + sample.x = (sample.x - prev) / pmf_x; + + if (pdf) { + *pdf = pmf_x * pmf_y * product(res); + } + + return {((float)x + sample.x) / (float)res.x, ((float)y + sample.y) / (float)res.y}; +} + +inline NGP_HOST_DEVICE float pdf_2d(vec2 sample, uint32_t img, const ivec2& res, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y) { + ivec2 p = clamp(ivec2(sample * vec2(res)), 0, res - 1); + + cdf_y += img * res.y; + cdf_x_cond_y += img * res.y * res.x + p.y * res.x; + + float pmf_y = cdf_y[p.y]; + if (p.y > 0) { + pmf_y -= cdf_y[p.y-1]; + } + + float pmf_x = cdf_x_cond_y[p.x]; + if (p.x > 0) { + pmf_x -= cdf_x_cond_y[p.x-1]; + } + + // Probability mass of picking the pixel + float pmf = pmf_x * pmf_y; + + // To convert to probability density, divide by area of pixel + return UNIFORM_SAMPLING_FRACTION + pmf * product(res) * (1.0f - UNIFORM_SAMPLING_FRACTION); +} + +inline __device__ vec2 nerf_random_image_pos_training(default_rng_t& rng, const ivec2& resolution, bool snap_to_pixel_centers, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y, const ivec2& cdf_res, uint32_t img, float* __restrict__ pdf = nullptr) { + vec2 uv = random_val_2d(rng); + + if (cdf_x_cond_y) { + uv = sample_cdf_2d(uv, img, cdf_res, cdf_x_cond_y, cdf_y, pdf); + } else { + // // Warp-coherent tile + // uv.x = __shfl_sync(0xFFFFFFFF, uv.x, 0); + // uv.y = __shfl_sync(0xFFFFFFFF, uv.y, 0); + + // const ivec2 TILE_SIZE = {8, 4}; + // uv = (uv * vec2(resolution - TILE_SIZE) + vec2(tcnn::lane_id() % TILE_SIZE.x, tcnn::lane_id() / threadIdx.x)) / vec2(resolution); + + if (pdf) { + *pdf = 1.0f; + } + } + + if (snap_to_pixel_centers) { + uv = (vec2(clamp(ivec2(uv * vec2(resolution)), 0, resolution - 1)) + 0.5f) / 
vec2(resolution); + } + + return uv; +} + +inline NGP_HOST_DEVICE uint32_t image_idx(uint32_t base_idx, uint32_t n_rays, uint32_t n_rays_total, uint32_t n_training_images, const float* __restrict__ cdf = nullptr, float* __restrict__ pdf = nullptr) { + if (cdf) { + float sample = ld_random_val(base_idx/* + n_rays_total*/, 0xdeadbeef); + // float sample = random_val(base_idx/* + n_rays_total*/); + uint32_t img = binary_search(sample, cdf, n_training_images); + + if (pdf) { + float prev = img > 0 ? cdf[img-1] : 0.0f; + *pdf = (cdf[img] - prev) * n_training_images; + } + + return img; + } + + // return ((base_idx/* + n_rays_total*/) * 56924617 + 96925573) % n_training_images; + + // Neighboring threads in the warp process the same image. Increases locality. + if (pdf) { + *pdf = 1.0f; + } + return (((base_idx/* + n_rays_total*/) * n_training_images) / n_rays) % n_training_images; +} + +inline NGP_HOST_DEVICE LossAndGradient loss_and_gradient(const vec3& target, const vec3& prediction, ELossType loss_type) { + switch (loss_type) { + case ELossType::RelativeL2: return relative_l2_loss(target, prediction); break; + case ELossType::L1: return l1_loss(target, prediction); break; + case ELossType::Mape: return mape_loss(target, prediction); break; + case ELossType::Smape: return smape_loss(target, prediction); break; + // Note: we divide the huber loss by a factor of 5 such that its L2 region near zero + // matches with the L2 loss and error numbers become more comparable. This allows reading + // off dB numbers of ~converged models and treating them as approximate PSNR to compare + // with other NeRF methods. Self-normalizing optimizers such as Adam are agnostic to such + // constant factors; optimization is therefore unaffected. + case ELossType::Huber: return huber_loss(target, prediction, 0.1f) / 5.0f; break; + case ELossType::LogL1: return log_l1_loss(target, prediction); break; + default: case ELossType::L2: return l2_loss(target, prediction); break; + } +} + +} diff --git a/include/neural-graphics-primitives/nerf_loader.h b/include/neural-graphics-primitives/nerf_loader.h index 6456cc3f8..56f2f2f26 100644 --- a/include/neural-graphics-primitives/nerf_loader.h +++ b/include/neural-graphics-primitives/nerf_loader.h @@ -16,34 +16,18 @@ #pragma once #include -#include +#include +#include #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { // how much to scale the scene by vs the original nerf dataset; we want to fit the thing in the unit cube static constexpr float NERF_SCALE = 0.33f; -struct TrainingImageMetadata { - // Camera intrinsics and additional data associated with a NeRF training image - // the memory to back the pixels and rays is held by GPUMemory objects in the NerfDataset and copied here. - const void* pixels = nullptr; - EImageDataType image_data_type = EImageDataType::Half; - - const float* depth = nullptr; - const Ray* rays = nullptr; - - Lens lens = {}; - ivec2 resolution = ivec2(0); - vec2 principal_point = vec2(0.5f); - vec2 focal_length = vec2(1000.f); - vec4 rolling_shutter = vec4(0.0f); - vec3 light_dir = vec3(0.f); // TODO: replace this with more generic float[] of task-specific metadata. 
-}; - inline size_t image_type_size(EImageDataType type) { switch (type) { case EImageDataType::None: return 0; @@ -67,23 +51,23 @@ struct NerfDataset { return xforms == other.xforms && paths == other.paths; } - std::vector> raymemory; - std::vector> pixelmemory; - std::vector> depthmemory; + std::vector> raymemory; + std::vector> pixelmemory; + std::vector> depthmemory; std::vector metadata; - tcnn::GPUMemory metadata_gpu; + GPUMemory metadata_gpu; void update_metadata(int first = 0, int last = -1); std::vector xforms; std::vector paths; - tcnn::GPUMemory sharpness_data; + GPUMemory sharpness_data; ivec2 sharpness_resolution = {0, 0}; - tcnn::GPUMemory envmap_data; + GPUMemory envmap_data; BoundingBox render_aabb = {}; - mat3 render_aabb_to_local = mat3(1.0f); + mat3 render_aabb_to_local = mat3::identity(); vec3 up = {0.0f, 1.0f, 0.0f}; vec3 offset = {0.0f, 0.0f, 0.0f}; size_t n_images = 0; @@ -107,9 +91,9 @@ struct NerfDataset { vec3 nerf_direction_to_ngp(const vec3& nerf_dir) { vec3 result = nerf_dir; if (from_mitsuba) { - result *= -1; + result *= -1.0f; } else { - result = vec3(result.y, result.z, result.x); + result = vec3{result.y, result.z, result.x}; } return result; } @@ -122,8 +106,8 @@ struct NerfDataset { result[3] = result[3] * scale + offset; if (from_mitsuba) { - result[0] *= -1; - result[2] *= -1; + result[0] *= -1.0f; + result[2] *= -1.0f; } else { // Cycle axes xyz<-yzx vec4 tmp = row(result, 0); @@ -138,8 +122,8 @@ struct NerfDataset { mat4x3 ngp_matrix_to_nerf(const mat4x3& ngp_matrix, bool scale_columns = false) const { mat4x3 result = ngp_matrix; if (from_mitsuba) { - result[0] *= -1; - result[2] *= -1; + result[0] *= -1.0f; + result[2] *= -1.0f; } else { // Cycle axes xyz->yzx vec4 tmp = row(result, 0); @@ -156,14 +140,14 @@ struct NerfDataset { vec3 ngp_position_to_nerf(vec3 pos) const { if (!from_mitsuba) { - pos = vec3(pos.z, pos.x, pos.y); + pos = vec3{pos.z, pos.x, pos.y}; } return (pos - offset) / scale; } vec3 nerf_position_to_ngp(const vec3 &pos) const { vec3 rv = pos * scale + offset; - return from_mitsuba ? rv : vec3(rv.y, rv.z, rv.x); + return from_mitsuba ? 
rv : vec3{rv.y, rv.z, rv.x}; } void nerf_ray_to_ngp(Ray& ray, bool scale_direction = false) { @@ -187,4 +171,4 @@ struct NerfDataset { NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amount = 0.f); NerfDataset create_empty_nerf_dataset(size_t n_images, int aabb_scale = 1, bool is_hdr = false); -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/nerf_network.h b/include/neural-graphics-primitives/nerf_network.h index 55b513056..3dbf67321 100644 --- a/include/neural-graphics-primitives/nerf_network.h +++ b/include/neural-graphics-primitives/nerf_network.h @@ -26,7 +26,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { template __global__ void extract_density( @@ -74,39 +74,41 @@ __global__ void add_density_gradient( } template -class NerfNetwork : public tcnn::Network { +class NerfNetwork : public Network { public: using json = nlohmann::json; NerfNetwork(uint32_t n_pos_dims, uint32_t n_dir_dims, uint32_t n_extra_dims, uint32_t dir_offset, const json& pos_encoding, const json& dir_encoding, const json& density_network, const json& rgb_network) : m_n_pos_dims{n_pos_dims}, m_n_dir_dims{n_dir_dims}, m_dir_offset{dir_offset}, m_n_extra_dims{n_extra_dims} { - m_pos_encoding.reset(tcnn::create_encoding(n_pos_dims, pos_encoding, density_network.contains("otype") && (tcnn::equals_case_insensitive(density_network["otype"], "FullyFusedMLP") || tcnn::equals_case_insensitive(density_network["otype"], "MegakernelMLP")) ? 16u : 8u)); - uint32_t rgb_alignment = tcnn::minimum_alignment(rgb_network); - m_dir_encoding.reset(tcnn::create_encoding(m_n_dir_dims + m_n_extra_dims, dir_encoding, rgb_alignment)); + m_pos_encoding.reset(create_encoding(n_pos_dims, pos_encoding, density_network.contains("otype") && (equals_case_insensitive(density_network["otype"], "FullyFusedMLP") || equals_case_insensitive(density_network["otype"], "MegakernelMLP")) ? 
16u : 8u)); + uint32_t rgb_alignment = minimum_alignment(rgb_network); + m_dir_encoding.reset(create_encoding(m_n_dir_dims + m_n_extra_dims, dir_encoding, rgb_alignment)); json local_density_network_config = density_network; local_density_network_config["n_input_dims"] = m_pos_encoding->padded_output_width(); if (!density_network.contains("n_output_dims")) { local_density_network_config["n_output_dims"] = 16; } - m_density_network.reset(tcnn::create_network(local_density_network_config)); + m_density_network.reset(create_network(local_density_network_config)); - m_rgb_network_input_width = tcnn::next_multiple(m_dir_encoding->padded_output_width() + m_density_network->padded_output_width(), rgb_alignment); + m_rgb_network_input_width = next_multiple(m_dir_encoding->padded_output_width() + m_density_network->padded_output_width(), rgb_alignment); json local_rgb_network_config = rgb_network; local_rgb_network_config["n_input_dims"] = m_rgb_network_input_width; local_rgb_network_config["n_output_dims"] = 3; - m_rgb_network.reset(tcnn::create_network(local_rgb_network_config)); + m_rgb_network.reset(create_network(local_rgb_network_config)); + + m_density_model = std::make_shared>(m_pos_encoding, m_density_network); } virtual ~NerfNetwork() { } - void inference_mixed_precision_impl(cudaStream_t stream, const tcnn::GPUMatrixDynamic& input, tcnn::GPUMatrixDynamic& output, bool use_inference_params = true) override { + void inference_mixed_precision_impl(cudaStream_t stream, const GPUMatrixDynamic& input, GPUMatrixDynamic& output, bool use_inference_params = true) override { uint32_t batch_size = input.n(); - tcnn::GPUMatrixDynamic density_network_input{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; - tcnn::GPUMatrixDynamic rgb_network_input{m_rgb_network_input_width, batch_size, stream, m_dir_encoding->preferred_output_layout()}; + GPUMatrixDynamic density_network_input{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; + GPUMatrixDynamic rgb_network_input{m_rgb_network_input_width, batch_size, stream, m_dir_encoding->preferred_output_layout()}; - tcnn::GPUMatrixDynamic density_network_output = rgb_network_input.slice_rows(0, m_density_network->padded_output_width()); - tcnn::GPUMatrixDynamic rgb_network_output{output.data(), m_rgb_network->padded_output_width(), batch_size, output.layout()}; + GPUMatrixDynamic density_network_output = rgb_network_input.slice_rows(0, m_density_network->padded_output_width()); + GPUMatrixDynamic rgb_network_output{output.data(), m_rgb_network->padded_output_width(), batch_size, output.layout()}; m_pos_encoding->inference_mixed_precision( stream, @@ -127,12 +129,12 @@ class NerfNetwork : public tcnn::Network { m_rgb_network->inference_mixed_precision(stream, rgb_network_input, rgb_network_output, use_inference_params); - tcnn::linear_kernel(extract_density, 0, stream, + linear_kernel(extract_density, 0, stream, batch_size, - density_network_output.layout() == tcnn::AoS ? density_network_output.stride() : 1, - output.layout() == tcnn::AoS ? padded_output_width() : 1, + density_network_output.layout() == AoS ? density_network_output.stride() : 1, + output.layout() == AoS ? padded_output_width() : 1, density_network_output.data(), - output.data() + 3 * (output.layout() == tcnn::AoS ? 1 : batch_size) + output.data() + 3 * (output.layout() == AoS ? 
1 : batch_size) ); } @@ -140,14 +142,14 @@ class NerfNetwork : public tcnn::Network { return m_density_network->padded_output_width(); } - std::unique_ptr forward_impl(cudaStream_t stream, const tcnn::GPUMatrixDynamic& input, tcnn::GPUMatrixDynamic* output = nullptr, bool use_inference_params = false, bool prepare_input_gradients = false) override { + std::unique_ptr forward_impl(cudaStream_t stream, const GPUMatrixDynamic& input, GPUMatrixDynamic* output = nullptr, bool use_inference_params = false, bool prepare_input_gradients = false) override { // Make sure our temporary buffers have the correct size for the given batch size uint32_t batch_size = input.n(); auto forward = std::make_unique(); - forward->density_network_input = tcnn::GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; - forward->rgb_network_input = tcnn::GPUMatrixDynamic{m_rgb_network_input_width, batch_size, stream, m_dir_encoding->preferred_output_layout()}; + forward->density_network_input = GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; + forward->rgb_network_input = GPUMatrixDynamic{m_rgb_network_input_width, batch_size, stream, m_dir_encoding->preferred_output_layout()}; forward->pos_encoding_ctx = m_pos_encoding->forward( stream, @@ -170,14 +172,14 @@ class NerfNetwork : public tcnn::Network { ); if (output) { - forward->rgb_network_output = tcnn::GPUMatrixDynamic{output->data(), m_rgb_network->padded_output_width(), batch_size, output->layout()}; + forward->rgb_network_output = GPUMatrixDynamic{output->data(), m_rgb_network->padded_output_width(), batch_size, output->layout()}; } forward->rgb_network_ctx = m_rgb_network->forward(stream, forward->rgb_network_input, output ? &forward->rgb_network_output : nullptr, use_inference_params, prepare_input_gradients); if (output) { - tcnn::linear_kernel(extract_density, 0, stream, - batch_size, m_dir_encoding->preferred_output_layout() == tcnn::AoS ? forward->density_network_output.stride() : 1, padded_output_width(), forward->density_network_output.data(), output->data()+3 + linear_kernel(extract_density, 0, stream, + batch_size, m_dir_encoding->preferred_output_layout() == AoS ? 
forward->density_network_output.stride() : 1, padded_output_width(), forward->density_network_output.data(), output->data()+3 ); } @@ -186,33 +188,33 @@ class NerfNetwork : public tcnn::Network { void backward_impl( cudaStream_t stream, - const tcnn::Context& ctx, - const tcnn::GPUMatrixDynamic& input, - const tcnn::GPUMatrixDynamic& output, - const tcnn::GPUMatrixDynamic& dL_doutput, - tcnn::GPUMatrixDynamic* dL_dinput = nullptr, + const Context& ctx, + const GPUMatrixDynamic& input, + const GPUMatrixDynamic& output, + const GPUMatrixDynamic& dL_doutput, + GPUMatrixDynamic* dL_dinput = nullptr, bool use_inference_params = false, - tcnn::EGradientMode param_gradients_mode = tcnn::EGradientMode::Overwrite + GradientMode param_gradients_mode = GradientMode::Overwrite ) override { const auto& forward = dynamic_cast(ctx); // Make sure our teporary buffers have the correct size for the given batch size uint32_t batch_size = input.n(); - tcnn::GPUMatrix dL_drgb{m_rgb_network->padded_output_width(), batch_size, stream}; + GPUMatrix dL_drgb{m_rgb_network->padded_output_width(), batch_size, stream}; CUDA_CHECK_THROW(cudaMemsetAsync(dL_drgb.data(), 0, dL_drgb.n_bytes(), stream)); - tcnn::linear_kernel(extract_rgb, 0, stream, + linear_kernel(extract_rgb, 0, stream, batch_size*3, dL_drgb.m(), dL_doutput.m(), dL_doutput.data(), dL_drgb.data() ); - const tcnn::GPUMatrixDynamic rgb_network_output{(T*)output.data(), m_rgb_network->padded_output_width(), batch_size, output.layout()}; - tcnn::GPUMatrixDynamic dL_drgb_network_input{m_rgb_network_input_width, batch_size, stream, m_dir_encoding->preferred_output_layout()}; + const GPUMatrixDynamic rgb_network_output{(T*)output.data(), m_rgb_network->padded_output_width(), batch_size, output.layout()}; + GPUMatrixDynamic dL_drgb_network_input{m_rgb_network_input_width, batch_size, stream, m_dir_encoding->preferred_output_layout()}; m_rgb_network->backward(stream, *forward.rgb_network_ctx, forward.rgb_network_input, rgb_network_output, dL_drgb, &dL_drgb_network_input, use_inference_params, param_gradients_mode); // Backprop through dir encoding if it is trainable or if we need input gradients if (m_dir_encoding->n_params() > 0 || dL_dinput) { - tcnn::GPUMatrixDynamic dL_ddir_encoding_output = dL_drgb_network_input.slice_rows(m_density_network->padded_output_width(), m_dir_encoding->padded_output_width()); - tcnn::GPUMatrixDynamic dL_ddir_encoding_input; + GPUMatrixDynamic dL_ddir_encoding_output = dL_drgb_network_input.slice_rows(m_density_network->padded_output_width(), m_dir_encoding->padded_output_width()); + GPUMatrixDynamic dL_ddir_encoding_input; if (dL_dinput) { dL_ddir_encoding_input = dL_dinput->slice_rows(m_dir_offset, m_dir_encoding->input_width()); } @@ -229,25 +231,25 @@ class NerfNetwork : public tcnn::Network { ); } - tcnn::GPUMatrixDynamic dL_ddensity_network_output = dL_drgb_network_input.slice_rows(0, m_density_network->padded_output_width()); - tcnn::linear_kernel(add_density_gradient, 0, stream, + GPUMatrixDynamic dL_ddensity_network_output = dL_drgb_network_input.slice_rows(0, m_density_network->padded_output_width()); + linear_kernel(add_density_gradient, 0, stream, batch_size, dL_doutput.m(), dL_doutput.data(), - dL_ddensity_network_output.layout() == tcnn::RM ? 1 : dL_ddensity_network_output.stride(), + dL_ddensity_network_output.layout() == RM ? 
1 : dL_ddensity_network_output.stride(), dL_ddensity_network_output.data() ); - tcnn::GPUMatrixDynamic dL_ddensity_network_input; + GPUMatrixDynamic dL_ddensity_network_input; if (m_pos_encoding->n_params() > 0 || dL_dinput) { - dL_ddensity_network_input = tcnn::GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; + dL_ddensity_network_input = GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; } m_density_network->backward(stream, *forward.density_network_ctx, forward.density_network_input, forward.density_network_output, dL_ddensity_network_output, dL_ddensity_network_input.data() ? &dL_ddensity_network_input : nullptr, use_inference_params, param_gradients_mode); // Backprop through pos encoding if it is trainable or if we need input gradients if (dL_ddensity_network_input.data()) { - tcnn::GPUMatrixDynamic dL_dpos_encoding_input; + GPUMatrixDynamic dL_dpos_encoding_input; if (dL_dinput) { dL_dpos_encoding_input = dL_dinput->slice_rows(0, m_pos_encoding->input_width()); } @@ -265,26 +267,19 @@ class NerfNetwork : public tcnn::Network { } } - void density(cudaStream_t stream, const tcnn::GPUMatrixDynamic& input, tcnn::GPUMatrixDynamic& output, bool use_inference_params = true) { - if (input.layout() != tcnn::CM) { + void density(cudaStream_t stream, const GPUMatrixDynamic& input, GPUMatrixDynamic& output, bool use_inference_params = true) { + if (input.layout() != CM) { throw std::runtime_error("NerfNetwork::density input must be in column major format."); } uint32_t batch_size = output.n(); - tcnn::GPUMatrixDynamic density_network_input{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; + GPUMatrixDynamic density_network_input{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; - m_pos_encoding->inference_mixed_precision( - stream, - input.slice_rows(0, m_pos_encoding->input_width()), - density_network_input, - use_inference_params - ); - - m_density_network->inference_mixed_precision(stream, density_network_input, output, use_inference_params); + m_density_model->inference_mixed_precision(stream, input.slice_rows(0, m_pos_encoding->input_width()), output, use_inference_params); } - std::unique_ptr density_forward(cudaStream_t stream, const tcnn::GPUMatrixDynamic& input, tcnn::GPUMatrixDynamic* output = nullptr, bool use_inference_params = false, bool prepare_input_gradients = false) { - if (input.layout() != tcnn::CM) { + std::unique_ptr density_forward(cudaStream_t stream, const GPUMatrixDynamic& input, GPUMatrixDynamic* output = nullptr, bool use_inference_params = false, bool prepare_input_gradients = false) { + if (input.layout() != CM) { throw std::runtime_error("NerfNetwork::density_forward input must be in column major format."); } @@ -293,7 +288,7 @@ class NerfNetwork : public tcnn::Network { auto forward = std::make_unique(); - forward->density_network_input = tcnn::GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; + forward->density_network_input = GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; forward->pos_encoding_ctx = m_pos_encoding->forward( stream, @@ -304,7 +299,7 @@ class NerfNetwork : public tcnn::Network { ); if (output) { - forward->density_network_output = 
tcnn::GPUMatrixDynamic{output->data(), m_density_network->padded_output_width(), batch_size, output->layout()}; + forward->density_network_output = GPUMatrixDynamic{output->data(), m_density_network->padded_output_width(), batch_size, output->layout()}; } forward->density_network_ctx = m_density_network->forward(stream, forward->density_network_input, output ? &forward->density_network_output : nullptr, use_inference_params, prepare_input_gradients); @@ -314,15 +309,15 @@ class NerfNetwork : public tcnn::Network { void density_backward( cudaStream_t stream, - const tcnn::Context& ctx, - const tcnn::GPUMatrixDynamic& input, - const tcnn::GPUMatrixDynamic& output, - const tcnn::GPUMatrixDynamic& dL_doutput, - tcnn::GPUMatrixDynamic* dL_dinput = nullptr, + const Context& ctx, + const GPUMatrixDynamic& input, + const GPUMatrixDynamic& output, + const GPUMatrixDynamic& dL_doutput, + GPUMatrixDynamic* dL_dinput = nullptr, bool use_inference_params = false, - tcnn::EGradientMode param_gradients_mode = tcnn::EGradientMode::Overwrite + GradientMode param_gradients_mode = GradientMode::Overwrite ) { - if (input.layout() != tcnn::CM || (dL_dinput && dL_dinput->layout() != tcnn::CM)) { + if (input.layout() != CM || (dL_dinput && dL_dinput->layout() != CM)) { throw std::runtime_error("NerfNetwork::density_backward input must be in column major format."); } @@ -331,16 +326,16 @@ class NerfNetwork : public tcnn::Network { // Make sure our temporary buffers have the correct size for the given batch size uint32_t batch_size = input.n(); - tcnn::GPUMatrixDynamic dL_ddensity_network_input; + GPUMatrixDynamic dL_ddensity_network_input; if (m_pos_encoding->n_params() > 0 || dL_dinput) { - dL_ddensity_network_input = tcnn::GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; + dL_ddensity_network_input = GPUMatrixDynamic{m_pos_encoding->padded_output_width(), batch_size, stream, m_pos_encoding->preferred_output_layout()}; } m_density_network->backward(stream, *forward.density_network_ctx, forward.density_network_input, output, dL_doutput, dL_ddensity_network_input.data() ? 
&dL_ddensity_network_input : nullptr, use_inference_params, param_gradients_mode); // Backprop through pos encoding if it is trainable or if we need input gradients if (dL_ddensity_network_input.data()) { - tcnn::GPUMatrixDynamic dL_dpos_encoding_input; + GPUMatrixDynamic dL_dpos_encoding_input; if (dL_dinput) { dL_dpos_encoding_input = dL_dinput->slice_rows(0, m_pos_encoding->input_width()); } @@ -359,6 +354,8 @@ class NerfNetwork : public tcnn::Network { } void set_params_impl(T* params, T* inference_params, T* gradients) override { + m_density_model->set_params(params, inference_params, gradients); + size_t offset = 0; m_density_network->set_params(params + offset, inference_params + offset, gradients + offset); offset += m_density_network->n_params(); @@ -373,7 +370,7 @@ class NerfNetwork : public tcnn::Network { offset += m_dir_encoding->n_params(); } - void initialize_params(tcnn::pcg32& rnd, float* params_full_precision, float scale = 1) override { + void initialize_params(pcg32& rnd, float* params_full_precision, float scale = 1) override { m_density_network->initialize_params(rnd, params_full_precision, scale); params_full_precision += m_density_network->n_params(); @@ -434,7 +431,7 @@ class NerfNetwork : public tcnn::Network { return m_density_network->num_forward_activations() + m_rgb_network->num_forward_activations() + 2; } - std::pair forward_activations(const tcnn::Context& ctx, uint32_t layer) const override { + std::pair forward_activations(const Context& ctx, uint32_t layer) const override { const auto& forward = dynamic_cast(ctx); if (layer == 0) { return {forward.density_network_input.data(), m_pos_encoding->preferred_output_layout()}; @@ -447,23 +444,23 @@ class NerfNetwork : public tcnn::Network { } } - const std::shared_ptr>& pos_encoding() const { + const std::shared_ptr>& pos_encoding() const { return m_pos_encoding; } - const std::shared_ptr>& dir_encoding() const { + const std::shared_ptr>& dir_encoding() const { return m_dir_encoding; } - const std::shared_ptr>& density_network() const { + const std::shared_ptr>& density_network() const { return m_density_network; } - const std::shared_ptr>& rgb_network() const { + const std::shared_ptr>& rgb_network() const { return m_rgb_network; } - tcnn::json hyperparams() const override { + json hyperparams() const override { json density_network_hyperparams = m_density_network->hyperparams(); density_network_hyperparams["n_output_dims"] = m_density_network->padded_output_width(); return { @@ -476,10 +473,13 @@ class NerfNetwork : public tcnn::Network { } private: - std::shared_ptr> m_density_network; - std::shared_ptr> m_rgb_network; - std::shared_ptr> m_pos_encoding; - std::shared_ptr> m_dir_encoding; + std::shared_ptr> m_density_network; + std::shared_ptr> m_rgb_network; + std::shared_ptr> m_pos_encoding; + std::shared_ptr> m_dir_encoding; + + // Aggregates m_pos_encoding and m_density_network + std::shared_ptr> m_density_model; uint32_t m_rgb_network_input_width; uint32_t m_n_pos_dims; @@ -488,11 +488,11 @@ class NerfNetwork : public tcnn::Network { uint32_t m_dir_offset; // // Storage of forward pass data - struct ForwardContext : public tcnn::Context { - tcnn::GPUMatrixDynamic density_network_input; - tcnn::GPUMatrixDynamic density_network_output; - tcnn::GPUMatrixDynamic rgb_network_input; - tcnn::GPUMatrix rgb_network_output; + struct ForwardContext : public Context { + GPUMatrixDynamic density_network_input; + GPUMatrixDynamic density_network_output; + GPUMatrixDynamic rgb_network_input; + GPUMatrix 
rgb_network_output; std::unique_ptr pos_encoding_ctx; std::unique_ptr dir_encoding_ctx; @@ -502,4 +502,4 @@ class NerfNetwork : public tcnn::Network { }; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/openxr_hmd.h b/include/neural-graphics-primitives/openxr_hmd.h index ed3f5613e..f7150d794 100644 --- a/include/neural-graphics-primitives/openxr_hmd.h +++ b/include/neural-graphics-primitives/openxr_hmd.h @@ -43,7 +43,7 @@ #pragma GCC diagnostic ignored "-Wmissing-field-initializers" //TODO: XR struct are uninitiaized apart from their type #endif -NGP_NAMESPACE_BEGIN +namespace ngp { enum class EEnvironmentBlendMode { Opaque = XR_ENVIRONMENT_BLEND_MODE_OPAQUE, @@ -284,7 +284,7 @@ class OpenXRHMD { const bool m_print_reference_spaces = false; }; -NGP_NAMESPACE_END +} #ifdef __GNUC__ #pragma GCC diagnostic pop diff --git a/include/neural-graphics-primitives/random_val.cuh b/include/neural-graphics-primitives/random_val.cuh index 08314df64..4399b7a73 100644 --- a/include/neural-graphics-primitives/random_val.cuh +++ b/include/neural-graphics-primitives/random_val.cuh @@ -19,13 +19,13 @@ #include -#include +#include -NGP_NAMESPACE_BEGIN +namespace ngp { -using default_rng_t = tcnn::default_rng_t; +using default_rng_t = pcg32; -inline constexpr float PI() { return 3.14159265358979323846f; } +inline constexpr NGP_HOST_DEVICE float PI() { return 3.14159265358979323846f; } template inline __host__ __device__ float random_val(RNG& rng) { @@ -55,14 +55,14 @@ inline __host__ __device__ vec3 cylindrical_to_dir(const vec2& p) { inline __host__ __device__ vec2 dir_to_cylindrical(const vec3& d) { const float cos_theta = fminf(fmaxf(-d.z, -1.0f), 1.0f); - float phi = std::atan2(d.y, d.x); + float phi = atan2(d.y, d.x); return {(cos_theta + 1.0f) / 2.0f, (phi / (2.0f * PI())) + 0.5f}; } inline __host__ __device__ vec2 dir_to_spherical(const vec3& d) { const float cos_theta = fminf(fmaxf(d.z, -1.0f), 1.0f); const float theta = acosf(cos_theta); - float phi = std::atan2(d.y, d.x); + float phi = atan2(d.y, d.x); return {theta, phi}; } @@ -324,5 +324,5 @@ inline __host__ __device__ vec2 ld_random_pixel_offset(const uint32_t spp) { return offset; } -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/render_buffer.h b/include/neural-graphics-primitives/render_buffer.h index 64c6e9ca3..6bbb4bad3 100644 --- a/include/neural-graphics-primitives/render_buffer.h +++ b/include/neural-graphics-primitives/render_buffer.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include #include @@ -23,7 +23,7 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { typedef unsigned int GLenum; typedef int GLint; @@ -306,9 +306,9 @@ class CudaRenderBuffer { ivec2 m_in_resolution = ivec2(0); - tcnn::GPUMemory m_frame_buffer; - tcnn::GPUMemory m_depth_buffer; - tcnn::GPUMemory m_accumulate_buffer; + GPUMemory m_frame_buffer; + GPUMemory m_depth_buffer; + GPUMemory m_accumulate_buffer; std::shared_ptr> m_hidden_area_mask = nullptr; @@ -316,4 +316,4 @@ class CudaRenderBuffer { std::shared_ptr m_depth_target; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/sdf.h b/include/neural-graphics-primitives/sdf.h index 3b375a982..0d28f10e9 100644 --- a/include/neural-graphics-primitives/sdf.h +++ b/include/neural-graphics-primitives/sdf.h @@ -18,7 +18,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { struct SdfPayload { vec3 dir; @@ -28,7 +28,7 @@ struct SdfPayload { }; struct RaysSdfSoa { -#if defined(__NVCC__) || (defined(__clang__) && defined(__CUDA__)) +#if 
defined(__CUDACC__) || (defined(__clang__) && defined(__CUDA__)) void copy_from_other_async(uint32_t n_elements, const RaysSdfSoa& other, cudaStream_t stream) { CUDA_CHECK_THROW(cudaMemcpyAsync(pos, other.pos, n_elements * sizeof(vec3), cudaMemcpyDeviceToDevice, stream)); CUDA_CHECK_THROW(cudaMemcpyAsync(normal, other.normal, n_elements * sizeof(vec3), cudaMemcpyDeviceToDevice, stream)); @@ -67,8 +67,8 @@ struct BRDFParams { float sheen=0.f; float clearcoat=0.f; float clearcoat_gloss=0.f; - vec3 basecolor=vec3(0.8f,0.8f,0.8f); - vec3 ambientcolor=vec3(0.f,0.f,0.f); + vec3 basecolor = {0.8f, 0.8f, 0.8f}; + vec3 ambientcolor = {0.0f, 0.0f, 0.0f}; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/shared_queue.h b/include/neural-graphics-primitives/shared_queue.h index 87629f7e8..61b5f91f2 100644 --- a/include/neural-graphics-primitives/shared_queue.h +++ b/include/neural-graphics-primitives/shared_queue.h @@ -17,7 +17,7 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { class ICallable { public: @@ -117,4 +117,4 @@ class SharedQueue { std::condition_variable mDataCondition; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/takikawa_encoding.cuh b/include/neural-graphics-primitives/takikawa_encoding.cuh index 04bad458c..344150542 100644 --- a/include/neural-graphics-primitives/takikawa_encoding.cuh +++ b/include/neural-graphics-primitives/takikawa_encoding.cuh @@ -23,19 +23,19 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { template __global__ void kernel_takikawa( const uint32_t num_elements, const uint32_t n_levels, const uint32_t starting_level, - const tcnn::InterpolationType interpolation_type, + const InterpolationType interpolation_type, const TriangleOctreeNode* octree_nodes, const TriangleOctreeDualNode* octree_dual_nodes, const T* __restrict__ grid, - const tcnn::MatrixView data_in, - tcnn::MatrixView data_out, + const MatrixView data_in, + MatrixView data_out, float* __restrict__ dy_dx ) { uint32_t n_features = N_FEATURES_PER_LEVEL * n_levels; @@ -61,7 +61,7 @@ __global__ void kernel_takikawa( vec3 pos_derivative; - if (interpolation_type == tcnn::InterpolationType::Linear) { + if (interpolation_type == InterpolationType::Linear) { NGP_PRAGMA_UNROLL for (uint32_t dim = 0; dim < 3; ++dim) { pos_derivative[dim] = 1.0f; @@ -69,14 +69,14 @@ __global__ void kernel_takikawa( } else { NGP_PRAGMA_UNROLL for (uint32_t dim = 0; dim < 3; ++dim) { - pos_derivative[dim] = tcnn::smoothstep_derivative(pos[dim]); - pos[dim] = tcnn::smoothstep(pos[dim]); + pos_derivative[dim] = smoothstep_derivative(pos[dim]); + pos[dim] = smoothstep(pos[dim]); } } if (data_out) { // Tri-linear interpolation - tcnn::vector_t result = {(T)0.0f}; + tvec result = {(T)0.0f}; NGP_PRAGMA_UNROLL for (uint32_t idx = 0; idx < 8; ++idx) { @@ -92,7 +92,7 @@ __global__ void kernel_takikawa( } int param_idx = node.vertices[idx] * N_FEATURES_PER_LEVEL; - result = fma((T)weight, *(tcnn::vector_t*)&grid[param_idx], result); + result = fma((T)weight, *(tvec*)&grid[param_idx], result); } NGP_PRAGMA_UNROLL @@ -107,7 +107,7 @@ __global__ void kernel_takikawa( NGP_PRAGMA_UNROLL for (uint32_t grad_dim = 0; grad_dim < 3; ++grad_dim) { - tcnn::vector_fullp_t grad = {0.0f}; + vec grad = {0.0f}; NGP_PRAGMA_UNROLL for (uint32_t idx = 0; idx < 4; ++idx) { @@ -127,11 +127,11 @@ __global__ void kernel_takikawa( } int param_idx = node.vertices[child_idx] * N_FEATURES_PER_LEVEL; - auto val_left = *(tcnn::vector_t*)&grid[param_idx]; + auto val_left = *(tvec*)&grid[param_idx]; 
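+					// val_left and the val_right fetched below are the two octree
+					// vertex feature vectors straddling the sample along grad_dim;
+					// their weighted difference is the analytic partial derivative
+					// of the tri-linear interpolation along that axis.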
child_idx |= 1 << grad_dim; param_idx = node.vertices[child_idx] * N_FEATURES_PER_LEVEL; - auto val_right = *(tcnn::vector_t*)&grid[param_idx]; + auto val_right = *(tvec*)&grid[param_idx]; NGP_PRAGMA_UNROLL for (uint32_t feature = 0; feature < N_FEATURES_PER_LEVEL; ++feature) { @@ -140,7 +140,7 @@ __global__ void kernel_takikawa( } const uint32_t fan_out_grad = n_features * 3; - *(tcnn::vector_fullp_t*)&dy_dx[i * fan_out_grad + level * N_FEATURES_PER_LEVEL + grad_dim * n_features] = grad; + *(vec*)&dy_dx[i * fan_out_grad + level * N_FEATURES_PER_LEVEL + grad_dim * n_features] = grad; } } } @@ -162,9 +162,9 @@ template __global__ void kernel_takikawa_backward_input( const uint32_t num_elements, const uint32_t num_grid_features, - const tcnn::MatrixView dL_dy, + const MatrixView dL_dy, const float* __restrict__ dy_dx, - tcnn::MatrixView dL_dx + MatrixView dL_dx ) { const uint32_t input_index = threadIdx.x + blockIdx.x * blockDim.x; if (input_index >= num_elements) return; @@ -186,12 +186,12 @@ __global__ void kernel_takikawa_backward( const uint32_t num_elements, const uint32_t n_levels, const uint32_t starting_level, - const tcnn::InterpolationType interpolation_type, + const InterpolationType interpolation_type, const TriangleOctreeNode* octree_nodes, const TriangleOctreeDualNode* octree_dual_nodes, - GRAD_T* __restrict__ params_gradient, - const tcnn::MatrixView data_in, - const tcnn::MatrixView dL_dy + GRAD_T* __restrict__ param_gradients, + const MatrixView data_in, + const MatrixView dL_dy ) { uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; const uint32_t encoded_index = i * N_FEATURES_PER_LEVEL * n_levels; @@ -212,14 +212,14 @@ __global__ void kernel_takikawa_backward( } level -= starting_level; - if (interpolation_type == tcnn::InterpolationType::Smoothstep) { + if (interpolation_type == InterpolationType::Smoothstep) { NGP_PRAGMA_UNROLL for (uint32_t dim = 0; dim < 3; ++dim) { - pos[dim] = tcnn::smoothstep(pos[dim]); + pos[dim] = smoothstep(pos[dim]); } } - tcnn::vector_t grad; + tvec grad; NGP_PRAGMA_UNROLL for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) { @@ -248,7 +248,7 @@ __global__ void kernel_takikawa_backward( NGP_PRAGMA_UNROLL for (uint32_t feature = 0; feature < N_FEATURES_PER_LEVEL; feature += 2) { __half2 v = {(__half)((float)grad[feature] * weight), (__half)((float)grad[feature+1] * weight)}; - atomicAdd((__half2*)¶ms_gradient[param_idx + feature], v); + atomicAdd((__half2*)¶m_gradients[param_idx + feature], v); } } else #endif @@ -259,7 +259,7 @@ __global__ void kernel_takikawa_backward( } else { NGP_PRAGMA_UNROLL for (uint32_t f = 0; f < N_FEATURES_PER_LEVEL; ++f) { - atomicAdd((float*)¶ms_gradient[param_idx], (float)grad[f] * weight); + atomicAdd((float*)¶m_gradients[param_idx], (float)grad[f] * weight); } } } @@ -269,7 +269,7 @@ __global__ void kernel_takikawa_backward( } template -class TakikawaEncoding : public tcnn::Encoding { +class TakikawaEncoding : public Encoding { public: #if TCNN_MIN_GPU_ARCH >= 60 // The GPUs that we tested this on do not have an efficient 1D fp16 @@ -284,7 +284,7 @@ public: using grad_t = float; #endif - TakikawaEncoding(uint32_t starting_level, std::shared_ptr octree, tcnn::InterpolationType interpolation_type) + TakikawaEncoding(uint32_t starting_level, std::shared_ptr octree, InterpolationType interpolation_type) : m_starting_level{starting_level}, m_octree{octree}, m_interpolation_type{interpolation_type} { if (m_starting_level >= m_octree->depth()) { @@ -300,10 +300,10 @@ public: virtual ~TakikawaEncoding() { } - 
std::unique_ptr forward_impl( + std::unique_ptr forward_impl( cudaStream_t stream, - const tcnn::GPUMatrixDynamic& input, - tcnn::GPUMatrixDynamic* output = nullptr, + const GPUMatrixDynamic& input, + GPUMatrixDynamic* output = nullptr, bool use_inference_params = false, bool prepare_input_gradients = false ) override { @@ -314,10 +314,10 @@ public: } if (prepare_input_gradients) { - forward->dy_dx = tcnn::GPUMatrix{3 * N_FEATURES_PER_LEVEL * n_levels(), input.n(), stream}; + forward->dy_dx = GPUMatrix{3 * N_FEATURES_PER_LEVEL * n_levels(), input.n(), stream}; } - tcnn::linear_kernel(kernel_takikawa, 0, stream, + linear_kernel(kernel_takikawa, 0, stream, input.n(), n_levels(), m_starting_level, @@ -326,7 +326,7 @@ public: m_octree->dual_nodes_gpu(), use_inference_params ? this->inference_params() : this->params(), input.view(), - output ? output->view() : tcnn::MatrixView{}, + output ? output->view() : MatrixView{}, forward->dy_dx.data() ); @@ -335,13 +335,13 @@ public: void backward_impl( cudaStream_t stream, - const tcnn::Context& ctx, - const tcnn::GPUMatrixDynamic& input, - const tcnn::GPUMatrixDynamic& output, - const tcnn::GPUMatrixDynamic& dL_doutput, - tcnn::GPUMatrixDynamic* dL_dinput = nullptr, + const Context& ctx, + const GPUMatrixDynamic& input, + const GPUMatrixDynamic& output, + const GPUMatrixDynamic& dL_doutput, + GPUMatrixDynamic* dL_dinput = nullptr, bool use_inference_params = false, - tcnn::EGradientMode param_gradients_mode = tcnn::EGradientMode::Overwrite + GradientMode param_gradients_mode = GradientMode::Overwrite ) override { const uint32_t num_elements = input.n(); if (padded_output_width() == 0 || num_elements == 0) { @@ -350,37 +350,37 @@ public: const auto& forward = dynamic_cast(ctx); - if (param_gradients_mode != tcnn::EGradientMode::Ignore) { + if (param_gradients_mode != GradientMode::Ignore) { // We accumulate gradients with grad_t precision, which, for performance reasons, is not always T. // If not, accumulate in a temporary buffer and cast later. 
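+			// (E.g. on GPUs without fast fp16 atomics, grad_t is float while T
+			// may be __half: atomic accumulation then happens in a float scratch
+			// buffer and is cast back to T once all atomics have completed.)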
- grad_t* params_gradient; - tcnn::GPUMemoryArena::Allocation params_gradient_tmp; + grad_t* param_gradients; + GPUMemoryArena::Allocation param_gradients_tmp; if (!std::is_same::value) { - params_gradient_tmp = tcnn::allocate_workspace(stream, n_params() * sizeof(grad_t)); - params_gradient = (grad_t*)params_gradient_tmp.data(); + param_gradients_tmp = allocate_workspace(stream, n_params() * sizeof(grad_t)); + param_gradients = (grad_t*)param_gradients_tmp.data(); } else { - params_gradient = (grad_t*)this->gradients(); + param_gradients = (grad_t*)this->gradients(); } - if (param_gradients_mode == tcnn::EGradientMode::Overwrite) { - CUDA_CHECK_THROW(cudaMemsetAsync(params_gradient, 0, n_params() * sizeof(grad_t), stream)); + if (param_gradients_mode == GradientMode::Overwrite) { + CUDA_CHECK_THROW(cudaMemsetAsync(param_gradients, 0, n_params() * sizeof(grad_t), stream)); } - tcnn::linear_kernel(kernel_takikawa_backward, 0, stream, + linear_kernel(kernel_takikawa_backward, 0, stream, num_elements, n_levels(), m_starting_level, m_interpolation_type, m_octree->nodes_gpu(), m_octree->dual_nodes_gpu(), - params_gradient, + param_gradients, input.view(), dL_doutput.view() ); if (!std::is_same::value) { - parallel_for_gpu(stream, n_params(), [grad=this->gradients(), grad_tmp=params_gradient] __device__ (size_t i) { + parallel_for_gpu(stream, n_params(), [grad=this->gradients(), grad_tmp=param_gradients] __device__ (size_t i) { grad[i] = (T)grad_tmp[i]; }); } @@ -388,7 +388,7 @@ public: // Gradient computation w.r.t. input if (dL_dinput) { - tcnn::linear_kernel(kernel_takikawa_backward_input, 0, stream, + linear_kernel(kernel_takikawa_backward_input, 0, stream, num_elements * input_width(), N_FEATURES_PER_LEVEL * n_levels(), dL_doutput.view(), @@ -424,9 +424,9 @@ public: void set_params_impl(T* params, T* inference_params, T* gradients) override { } - void initialize_params(tcnn::pcg32& rnd, float* params_full_precision, float scale = 1) override { + void initialize_params(pcg32& rnd, float* params_full_precision, float scale = 1) override { // Initialize the encoding from the GPU, because the number of parameters can be quite large. 
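+		// The call below draws every parameter uniformly from
+		// [-1e-4 * scale, 1e-4 * scale], so the octree features start out
+		// near zero.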
- tcnn::generate_random_uniform(rnd, n_params(), params_full_precision, -1e-4f * scale, 1e-4f * scale); + generate_random_uniform(rnd, n_params(), params_full_precision, -1e-4f * scale, 1e-4f * scale); } size_t n_params() const override { @@ -437,11 +437,11 @@ public: return m_octree->depth() - m_starting_level; } - tcnn::MatrixLayout preferred_output_layout() const override { - return tcnn::AoS; + MatrixLayout preferred_output_layout() const override { + return AoS; } - tcnn::json hyperparams() const override { + json hyperparams() const override { return { {"otype", "Takikawa"}, {"starting_level", m_starting_level}, @@ -450,8 +450,8 @@ public: } private: - struct ForwardContext : public tcnn::Context { - tcnn::GPUMatrix dy_dx; + struct ForwardContext : public Context { + GPUMatrix dy_dx; }; uint32_t m_starting_level; @@ -462,7 +462,7 @@ private: uint32_t m_n_to_pad = 0; std::shared_ptr m_octree; - tcnn::InterpolationType m_interpolation_type; + InterpolationType m_interpolation_type; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/testbed.h b/include/neural-graphics-primitives/testbed.h index e8eae9726..75aa028dd 100644 --- a/include/neural-graphics-primitives/testbed.h +++ b/include/neural-graphics-primitives/testbed.h @@ -44,16 +44,16 @@ struct GLFWwindow; -TCNN_NAMESPACE_BEGIN +namespace tcnn { template class Loss; template class Optimizer; template class Encoding; template class Network; template class Trainer; template class TrainableBuffer; -TCNN_NAMESPACE_END +} -NGP_NAMESPACE_BEGIN +namespace ngp { template class NerfNetwork; class TriangleOctree; @@ -137,7 +137,7 @@ class Testbed { float m_shadow_sharpness = 2048.f; bool m_trace_shadow_rays = false; - tcnn::GPUMemoryArena::Allocation m_scratch_alloc; + GPUMemoryArena::Allocation m_scratch_alloc; }; class NerfTracer { @@ -177,7 +177,7 @@ class Testbed { ); uint32_t trace( - NerfNetwork& network, + const std::shared_ptr>& network, const BoundingBox& render_aabb, const mat3& render_aabb_to_local, const BoundingBox& train_aabb, @@ -208,12 +208,12 @@ class Testbed { private: RaysNerfSoa m_rays[2]; RaysNerfSoa m_rays_hit; - precision_t* m_network_output; + network_precision_t* m_network_output; float* m_network_input; uint32_t* m_hit_counter; uint32_t* m_alive_counter; uint32_t m_n_rays_initialized = 0; - tcnn::GPUMemoryArena::Allocation m_scratch_alloc; + GPUMemoryArena::Allocation m_scratch_alloc; }; class FiniteDifferenceNormalsApproximator { @@ -234,7 +234,7 @@ class Testbed { float* dist_dy_neg; float* dist_dz_neg; - tcnn::GPUMemoryArena::Allocation m_scratch_alloc; + GPUMemoryArena::Allocation m_scratch_alloc; }; struct LevelStats { @@ -257,7 +257,7 @@ class Testbed { // underflow (round to zero) in the gradient computations. Hence, // scale the loss (and thereby gradients) up by this factor and // divide it out in the optimizer later on. 
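+	// (Illustrative: with a loss scale of 128 and fp16 gradients, a raw value
+	// of 1e-8 would flush to zero, whereas 1e-8 * 128 ~ 1.3e-6 is still
+	// representable; since the optimizer divides the factor back out, the
+	// update is mathematically unchanged.)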
- static constexpr float LOSS_SCALE = 128.0f; + static constexpr float LOSS_SCALE() { return default_loss_scale(); } struct NetworkDims { uint32_t n_input; @@ -278,11 +278,11 @@ class Testbed { class CudaDevice; - const float* get_inference_extra_dims(cudaStream_t stream) const; void render_nerf( cudaStream_t stream, + CudaDevice& device, const CudaRenderBufferView& render_buffer, - NerfNetwork& nerf_network, + const std::shared_ptr>& nerf_network, const uint8_t* density_grid_bitfield, const vec2& focal_length, const mat4x3& camera_matrix0, @@ -396,7 +396,6 @@ class Testbed { void last_training_view(); void previous_training_view(); void next_training_view(); - void add_training_views_to_camera_path(); void set_camera_to_training_view(int trainview); void reset_camera(); bool keyboard_event(); @@ -406,9 +405,9 @@ class Testbed { void mark_density_grid_in_sphere_empty(const vec3& pos, float radius, cudaStream_t stream); struct NerfCounters { - tcnn::GPUMemory numsteps_counter; // number of steps each ray took - tcnn::GPUMemory numsteps_counter_compacted; // number of steps each ray took - tcnn::GPUMemory loss; + GPUMemory numsteps_counter; // number of steps each ray took + GPUMemory numsteps_counter_compacted; // number of steps each ray took + GPUMemory loss; uint32_t rays_per_batch = 1<<12; uint32_t n_rays_total = 0; @@ -438,13 +437,13 @@ class Testbed { vec2 render_screen_center(const vec2& screen_center) const; void optimise_mesh_step(uint32_t N_STEPS); void compute_mesh_vertex_colors(); - tcnn::GPUMemory get_density_on_grid(ivec3 res3d, const BoundingBox& aabb, const mat3& render_aabb_to_local); // network version (nerf or sdf) - tcnn::GPUMemory get_sdf_gt_on_grid(ivec3 res3d, const BoundingBox& aabb, const mat3& render_aabb_to_local); // sdf gt version (sdf only) - tcnn::GPUMemory get_rgba_on_grid(ivec3 res3d, vec3 ray_dir, bool voxel_centers, float depth, bool density_as_alpha = false); + GPUMemory get_density_on_grid(ivec3 res3d, const BoundingBox& aabb, const mat3& render_aabb_to_local); // network version (nerf or sdf) + GPUMemory get_sdf_gt_on_grid(ivec3 res3d, const BoundingBox& aabb, const mat3& render_aabb_to_local); // sdf gt version (sdf only) + GPUMemory get_rgba_on_grid(ivec3 res3d, vec3 ray_dir, bool voxel_centers, float depth, bool density_as_alpha = false); int marching_cubes(ivec3 res3d, const BoundingBox& render_aabb, const mat3& render_aabb_to_local, float thresh); float get_depth_from_renderbuffer(const CudaRenderBuffer& render_buffer, const vec2& uv); - vec3 get_3d_pos_from_pixel(const CudaRenderBuffer& render_buffer, const ivec2& focus_pixel); + vec3 get_3d_pos_from_pixel(const CudaRenderBuffer& render_buffer, const vec2& focus_pixel); void autofocus(); size_t n_params(); size_t first_encoder_param(); @@ -467,7 +466,6 @@ class Testbed { void init_vr(); void update_vr_performance_settings(); void apply_camera_smoothing(float elapsed_ms); - int find_best_training_view(int default_view); bool begin_frame(); void handle_user_input(); vec3 vr_to_world(const vec3& pos) const; @@ -501,6 +499,7 @@ class Testbed { ivec3 compute_and_save_png_slices(const fs::path& filename, int res, BoundingBox aabb = {}, float thresh = 2.5f, float density_range = 4.f, bool flip_y_and_z_axes = false); fs::path root_dir(); + void set_root_dir(const fs::path& dir); //////////////////////////////////////////////////////////////// // marching cubes related state @@ -512,14 +511,14 @@ class Testbed { float density_amount = 128.f; float inflate_amount = 1.f; bool optimize_mesh = false; - 
tcnn::GPUMemory verts;
- tcnn::GPUMemory vert_normals;
- tcnn::GPUMemory vert_colors;
- tcnn::GPUMemory verts_smoothed; // homogenous
- tcnn::GPUMemory indices;
- tcnn::GPUMemory verts_gradient;
+ GPUMemory verts;
+ GPUMemory vert_normals;
+ GPUMemory vert_colors;
+ GPUMemory verts_smoothed; // homogeneous
+ GPUMemory indices;
+ GPUMemory verts_gradient;
 std::shared_ptr> trainable_verts;
- std::shared_ptr> verts_optimizer;
+ std::shared_ptr> verts_optimizer;
 void clear() {
 indices={};
@@ -566,8 +565,8 @@ class Testbed {
 float m_ndc_znear = 1.0f / 32.0f;
 float m_ndc_zfar = 128.0f;
- mat4x3 m_camera = mat4x3(1.0f);
- mat4x3 m_smoothed_camera = mat4x3(1.0f);
+ mat4x3 m_camera = mat4x3::identity();
+ mat4x3 m_smoothed_camera = mat4x3::identity();
 size_t m_render_skip_due_to_lack_of_camera_movement_counter = 0;
 bool m_fps_camera = false;
@@ -639,10 +638,10 @@ class Testbed {
 int n_images_for_training_prev = 0; // how many images we saw last time we updated the density grid
 struct ErrorMap {
- tcnn::GPUMemory data;
- tcnn::GPUMemory cdf_x_cond_y;
- tcnn::GPUMemory cdf_y;
- tcnn::GPUMemory cdf_img;
+ GPUMemory data;
+ GPUMemory cdf_x_cond_y;
+ GPUMemory cdf_y;
+ GPUMemory cdf_img;
 std::vector pmf_img_cpu;
 ivec2 resolution = {16, 16};
 ivec2 cdf_resolution = {16, 16};
@@ -650,31 +649,31 @@
 } error_map;
 std::vector transforms;
- tcnn::GPUMemory transforms_gpu;
+ GPUMemory transforms_gpu;
 std::vector cam_pos_gradient;
- tcnn::GPUMemory cam_pos_gradient_gpu;
+ GPUMemory cam_pos_gradient_gpu;
 std::vector cam_rot_gradient;
- tcnn::GPUMemory cam_rot_gradient_gpu;
+ GPUMemory cam_rot_gradient_gpu;
- tcnn::GPUMemory cam_exposure_gpu;
+ GPUMemory cam_exposure_gpu;
 std::vector cam_exposure_gradient;
- tcnn::GPUMemory cam_exposure_gradient_gpu;
+ GPUMemory cam_exposure_gradient_gpu;
 vec2 cam_focal_length_gradient = vec2(0.0f);
- tcnn::GPUMemory cam_focal_length_gradient_gpu;
+ GPUMemory cam_focal_length_gradient_gpu;
 std::vector> cam_exposure;
 std::vector> cam_pos_offset;
 std::vector cam_rot_offset;
 AdamOptimizer cam_focal_length_offset = AdamOptimizer(0.0f);
- tcnn::GPUMemory extra_dims_gpu; // if the model demands a latent code per training image, we put them in here.
- tcnn::GPUMemory extra_dims_gradient_gpu;
+ GPUMemory extra_dims_gpu; // if the model demands a latent code per training image, we put them in here.
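+ // (Layout sketch, assuming the per-image latent codes are packed contiguously:
+ // image i's n_extra_learnable_dims floats would then start at float offset
+ // i * n_extra_learnable_dims within this buffer.)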
+ GPUMemory extra_dims_gradient_gpu; std::vector extra_dims_opt; - void reset_extra_dims(default_rng_t &rng); + std::vector get_extra_dims_cpu(int trainview) const; float extrinsic_l2_reg = 1e-4f; float extrinsic_learning_rate = 1e-3f; @@ -715,7 +714,7 @@ class Testbed { float depth_supervision_lambda = 0.f; - tcnn::GPUMemory sharpness_grid; + GPUMemory sharpness_grid; void set_camera_intrinsics(int frame_idx, float fx, float fy = 0.0f, float cx = -0.5f, float cy = -0.5f, float k1 = 0.0f, float k2 = 0.0f, float p1 = 0.0f, float p2 = 0.0f, float k3 = 0.0f, float k4 = 0.0f, bool is_fisheye = false); void set_camera_extrinsics_rolling_shutter(int frame_idx, mat4x3 camera_to_world_start, mat4x3 camera_to_world_end, const vec4& rolling_shutter, bool convert_to_ngp = true); @@ -732,10 +731,10 @@ class Testbed { void export_camera_extrinsics(const fs::path& path, bool export_extrinsics_in_quat_format = true); } training = {}; - tcnn::GPUMemory density_grid; // NERF_GRIDSIZE()^3 grid of EMA smoothed densities from the network - tcnn::GPUMemory density_grid_bitfield; + GPUMemory density_grid; // NERF_GRIDSIZE()^3 grid of EMA smoothed densities from the network + GPUMemory density_grid_bitfield; uint8_t* get_density_grid_bitfield_mip(uint32_t mip); - tcnn::GPUMemory density_grid_mean; + GPUMemory density_grid_mean; uint32_t density_grid_ema_step = 0; uint32_t max_cascade = 0; @@ -744,7 +743,12 @@ class Testbed { ENerfActivation density_activation = ENerfActivation::Exponential; vec3 light_dir = vec3(0.5f); - uint32_t extra_dim_idx_for_inference = 0; // which training image's latent code should be presented at inference time + // which training image's latent code should be used for rendering + int rendering_extra_dims_from_training_view = 0; + GPUMemory rendering_extra_dims; + + void reset_extra_dims(default_rng_t &rng); + const float* get_rendering_extra_dims(cudaStream_t stream) const; int show_accel = -1; @@ -757,10 +761,15 @@ class Testbed { Lens render_lens = {}; float render_min_transmittance = 0.01f; + bool render_gbuffer_hard_edges = false; float glow_y_cutoff = 0.f; int glow_mode = 0; + int find_closest_training_view(mat4x3 pose) const; + void set_rendering_extra_dims_from_training_view(int trainview); + void set_rendering_extra_dims(const std::vector& vals); + std::vector get_rendering_extra_dims_cpu() const; } m_nerf; struct Sdf { @@ -776,11 +785,11 @@ class Testbed { EMeshSdfMode mesh_sdf_mode = EMeshSdfMode::Raystab; float mesh_scale; - tcnn::GPUMemory triangles_gpu; + GPUMemory triangles_gpu; std::vector triangles_cpu; std::vector triangle_weights; DiscreteDistribution triangle_distribution; - tcnn::GPUMemory triangle_cdf; + GPUMemory triangle_cdf; std::shared_ptr triangle_bvh; // unique_ptr bool uses_takikawa_encoding = false; @@ -788,7 +797,7 @@ class Testbed { int octree_depth_target = 0; // we duplicate this state so that you can waggle the slider without triggering it immediately std::shared_ptr triangle_octree; - tcnn::GPUMemory brick_data; + GPUMemory brick_data; uint32_t brick_res = 0; uint32_t brick_level = 10; uint32_t brick_quantise_bits = 0; @@ -801,7 +810,7 @@ class Testbed { double iou = 0.0; float iou_decay = 0.0f; bool calculate_iou_online = false; - tcnn::GPUMemory iou_counter; + GPUMemory iou_counter; struct Training { size_t idx = 0; size_t size = 0; @@ -809,11 +818,11 @@ class Testbed { bool did_generate_more_training_data = false; bool generate_sdf_data_online = true; float surface_offset_scale = 1.0f; - tcnn::GPUMemory positions; - tcnn::GPUMemory 
positions_shuffled; - tcnn::GPUMemory distances; - tcnn::GPUMemory distances_shuffled; - tcnn::GPUMemory perturbations; + GPUMemory positions; + GPUMemory positions_shuffled; + GPUMemory distances; + GPUMemory distances_shuffled; + GPUMemory perturbations; } training = {}; } m_sdf; @@ -823,18 +832,18 @@ class Testbed { }; struct Image { - tcnn::GPUMemory data; + GPUMemory data; EDataType type = EDataType::Float; ivec2 resolution = ivec2(0); - tcnn::GPUMemory render_coords; - tcnn::GPUMemory render_out; + GPUMemory render_coords; + GPUMemory render_out; struct Training { - tcnn::GPUMemory positions_tmp; - tcnn::GPUMemory positions; - tcnn::GPUMemory targets; + GPUMemory positions_tmp; + GPUMemory positions; + GPUMemory targets; bool snap_to_pixel_centers = true; bool linear_colors = false; @@ -853,22 +862,22 @@ class Testbed { float albedo = 0.95f; float scattering = 0.f; float inv_distance_scale = 100.f; - tcnn::GPUMemory nanovdb_grid; - tcnn::GPUMemory bitgrid; + GPUMemory nanovdb_grid; + GPUMemory bitgrid; float global_majorant = 1.f; - vec3 world2index_offset = {0, 0, 0}; + vec3 world2index_offset = {0.0f, 0.0f, 0.0f}; float world2index_scale = 1.f; struct Training { - tcnn::GPUMemory positions = {}; - tcnn::GPUMemory targets = {}; + GPUMemory positions = {}; + GPUMemory targets = {}; } training = {}; // tracing state - tcnn::GPUMemory pos[2] = {}; - tcnn::GPUMemory payload[2] = {}; - tcnn::GPUMemory hit_counter = {}; - tcnn::GPUMemory radiance_and_density; + GPUMemory pos[2] = {}; + GPUMemory payload[2] = {}; + GPUMemory hit_counter = {}; + GPUMemory radiance_and_density; } m_volume; float m_camera_velocity = 1.0f; @@ -886,7 +895,7 @@ class Testbed { BoundingBox m_raw_aabb; BoundingBox m_aabb; BoundingBox m_render_aabb; - mat3 m_render_aabb_to_local = mat3(1.0f); + mat3 m_render_aabb_to_local = mat3::identity(); mat4x3 crop_box(bool nerf_space) const; std::vector crop_box_corners(bool nerf_space) const; @@ -915,9 +924,9 @@ class Testbed { ivec2 full_resolution = {1, 1}; int visualized_dimension = 0; - mat4x3 camera0 = mat4x3(1.0f); - mat4x3 camera1 = mat4x3(1.0f); - mat4x3 prev_camera = mat4x3(1.0f); + mat4x3 camera0 = mat4x3::identity(); + mat4x3 camera1 = mat4x3::identity(); + mat4x3 prev_camera = mat4x3::identity(); Foveation foveation; Foveation prev_foveation; @@ -957,15 +966,15 @@ class Testbed { vec3 m_parallax_shift = {0.0f, 0.0f, 0.0f}; // to shift the viewer's origin by some amount in camera space // CUDA stuff - tcnn::StreamAndEvent m_stream; + StreamAndEvent m_stream; // Hashgrid encoding analysis float m_quant_percent = 0.f; std::vector m_level_stats; std::vector m_first_layer_column_stats; - int m_n_levels = 0; + uint32_t m_n_levels = 0; uint32_t m_n_features_per_level = 0; - int m_histo_level = 0; // collect a histogram for this level + uint32_t m_histo_level = 0; // collect a histogram for this level uint32_t m_base_grid_resolution; float m_per_level_scale; float m_histo[257] = {}; @@ -983,19 +992,14 @@ class Testbed { class CudaDevice { public: struct Data { - tcnn::GPUMemory density_grid_bitfield; + GPUMemory density_grid_bitfield; uint8_t* density_grid_bitfield_ptr; - tcnn::GPUMemory params; + GPUMemory params; std::shared_ptr> hidden_area_mask; }; - CudaDevice(int id, bool is_primary) : m_id{id}, m_is_primary{is_primary} { - auto guard = device_guard(); - m_stream = std::make_unique(); - m_data = std::make_unique(); - m_render_worker = std::make_unique(is_primary ? 
0u : 1u); - } + CudaDevice(int id, bool is_primary); CudaDevice(const CudaDevice&) = delete; CudaDevice& operator=(const CudaDevice&) = delete; @@ -1003,17 +1007,7 @@ class Testbed { CudaDevice(CudaDevice&&) = default; CudaDevice& operator=(CudaDevice&&) = default; - tcnn::ScopeGuard device_guard() { - int prev_device = tcnn::cuda_device(); - if (prev_device == m_id) { - return {}; - } - - tcnn::set_cuda_device(m_id); - return tcnn::ScopeGuard{[prev_device]() { - tcnn::set_cuda_device(prev_device); - }}; - } + ScopeGuard device_guard(); int id() const { return m_id; @@ -1024,11 +1018,11 @@ class Testbed { } std::string name() const { - return tcnn::cuda_device_name(m_id); + return cuda_device_name(m_id); } int compute_capability() const { - return tcnn::cuda_compute_capability(m_id); + return cuda_compute_capability(m_id); } cudaStream_t stream() const { @@ -1064,17 +1058,14 @@ class Testbed { m_dirty = value; } - void set_network(const std::shared_ptr>& network) { - m_network = network; - } - - void set_nerf_network(const std::shared_ptr>& nerf_network); + void set_network(const std::shared_ptr>& network); + void set_nerf_network(const std::shared_ptr>& nerf_network); - const std::shared_ptr>& network() const { + const std::shared_ptr>& network() const { return m_network; } - const std::shared_ptr>& nerf_network() const { + const std::shared_ptr>& nerf_network() const { return m_nerf_network; } @@ -1087,7 +1078,7 @@ class Testbed { } template - auto enqueue_task(F&& f) -> std::future> { + auto enqueue_task(F&& f) -> std::future> { if (is_primary()) { return std::async(std::launch::deferred, std::forward(f)); } else { @@ -1098,7 +1089,7 @@ class Testbed { private: int m_id; bool m_is_primary; - std::unique_ptr m_stream; + std::unique_ptr m_stream; struct Event { Event() { CUDA_CHECK_THROW(cudaEventCreate(&event)); @@ -1122,8 +1113,8 @@ class Testbed { std::unique_ptr m_data; CudaRenderBufferView m_render_buffer_view = {}; - std::shared_ptr> m_network; - std::shared_ptr> m_nerf_network; + std::shared_ptr> m_network; + std::shared_ptr> m_nerf_network; bool m_dirty = true; @@ -1131,7 +1122,7 @@ class Testbed { }; void sync_device(CudaRenderBuffer& render_buffer, CudaDevice& device); - tcnn::ScopeGuard use_device(cudaStream_t stream, CudaRenderBuffer& render_buffer, CudaDevice& device); + ScopeGuard use_device(cudaStream_t stream, CudaRenderBuffer& render_buffer, CudaDevice& device); void set_all_devices_dirty(); std::vector m_devices; @@ -1155,7 +1146,6 @@ class Testbed { nlohmann::json m_network_config; - default_rng_t m_rng; CudaRenderBuffer m_windowless_render_surface{std::make_shared()}; @@ -1164,16 +1154,16 @@ class Testbed { uint32_t network_num_forward_activations() const; // Network & training stuff - std::shared_ptr> m_loss; - std::shared_ptr> m_optimizer; - std::shared_ptr> m_encoding; - std::shared_ptr> m_network; - std::shared_ptr> m_trainer; + std::shared_ptr> m_loss; + std::shared_ptr> m_optimizer; + std::shared_ptr> m_encoding; + std::shared_ptr> m_network; + std::shared_ptr> m_trainer; struct TrainableEnvmap { - std::shared_ptr> optimizer; + std::shared_ptr> optimizer; std::shared_ptr> envmap; - std::shared_ptr> trainer; + std::shared_ptr> trainer; ivec2 resolution; ELossType loss_type; @@ -1196,9 +1186,9 @@ class Testbed { } m_envmap; struct TrainableDistortionMap { - std::shared_ptr> optimizer; + std::shared_ptr> optimizer; std::shared_ptr> map; - std::shared_ptr> trainer; + std::shared_ptr> trainer; ivec2 resolution; Buffer2DView inference_view() const { @@ -1217,7 +1207,8 
@@ class Testbed { return {(const vec2*)map->params(), resolution}; } } m_distortion; - std::shared_ptr> m_nerf_network; + + std::shared_ptr> m_nerf_network; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/thread_pool.h b/include/neural-graphics-primitives/thread_pool.h index 879888306..099b72177 100644 --- a/include/neural-graphics-primitives/thread_pool.h +++ b/include/neural-graphics-primitives/thread_pool.h @@ -22,7 +22,7 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { template void wait_all(T&& futures) { @@ -106,4 +106,4 @@ class ThreadPool { std::condition_variable m_task_queue_completed_condition; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/tinyexr_wrapper.h b/include/neural-graphics-primitives/tinyexr_wrapper.h index db690368b..8a82ba91b 100644 --- a/include/neural-graphics-primitives/tinyexr_wrapper.h +++ b/include/neural-graphics-primitives/tinyexr_wrapper.h @@ -16,12 +16,12 @@ #pragma once -#include +#include -NGP_NAMESPACE_BEGIN +namespace ngp { void save_exr(const float* data, int width, int height, int nChannels, int channelStride, const fs::path& path); void load_exr(float** data, int* width, int* height, const fs::path& path); __half* load_exr_to_gpu(int* width, int* height, const fs::path& path, bool fix_premult); -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/tinyobj_loader_wrapper.h b/include/neural-graphics-primitives/tinyobj_loader_wrapper.h index b94859bf4..260bebea2 100644 --- a/include/neural-graphics-primitives/tinyobj_loader_wrapper.h +++ b/include/neural-graphics-primitives/tinyobj_loader_wrapper.h @@ -21,8 +21,8 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { std::vector load_obj(const fs::path& path); -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/trainable_buffer.cuh b/include/neural-graphics-primitives/trainable_buffer.cuh index dcf24e0c8..8fb03bbbf 100644 --- a/include/neural-graphics-primitives/trainable_buffer.cuh +++ b/include/neural-graphics-primitives/trainable_buffer.cuh @@ -24,45 +24,45 @@ #include #include -NGP_NAMESPACE_BEGIN +namespace ngp { template -class TrainableBuffer : public tcnn::DifferentiableObject { +class TrainableBuffer : public DifferentiableObject { public: template TrainableBuffer(const RES& resolution) { for (uint32_t i = 0; i < RANK; ++i) { m_resolution[i] = resolution[i]; } - m_params_gradient_weight.resize(n_params()); + m_param_gradients_weight.resize(n_params()); } virtual ~TrainableBuffer() { } - void inference_mixed_precision_impl(cudaStream_t stream, const tcnn::GPUMatrixDynamic& input, tcnn::GPUMatrixDynamic& output, bool use_inference_matrices = true) override { + void inference_mixed_precision_impl(cudaStream_t stream, const GPUMatrixDynamic& input, GPUMatrixDynamic& output, bool use_inference_matrices = true) override { throw std::runtime_error{"The trainable buffer does not support inference(). Its content is meant to be used externally."}; } - std::unique_ptr forward_impl(cudaStream_t stream, const tcnn::GPUMatrixDynamic& input, tcnn::GPUMatrixDynamic* output = nullptr, bool use_inference_matrices = false, bool prepare_input_gradients = false) override { + std::unique_ptr forward_impl(cudaStream_t stream, const GPUMatrixDynamic& input, GPUMatrixDynamic* output = nullptr, bool use_inference_matrices = false, bool prepare_input_gradients = false) override { throw std::runtime_error{"The trainable buffer does not support forward(). 
Its content is meant to be used externally."}; } void backward_impl( cudaStream_t stream, - const tcnn::Context& ctx, - const tcnn::GPUMatrixDynamic& input, - const tcnn::GPUMatrixDynamic& output, - const tcnn::GPUMatrixDynamic& dL_doutput, - tcnn::GPUMatrixDynamic* dL_dinput = nullptr, + const Context& ctx, + const GPUMatrixDynamic& input, + const GPUMatrixDynamic& output, + const GPUMatrixDynamic& dL_doutput, + GPUMatrixDynamic* dL_dinput = nullptr, bool use_inference_matrices = false, - tcnn::EGradientMode param_gradients_mode = tcnn::EGradientMode::Overwrite + GradientMode param_gradients_mode = GradientMode::Overwrite ) override { throw std::runtime_error{"The trainable buffer does not support backward(). Its content is meant to be used externally."}; } void set_params_impl(T* params, T* inference_params, T* gradients) override { } - void initialize_params(tcnn::pcg32& rnd, float* params_full_precision, float scale = 1) override { + void initialize_params(pcg32& rnd, float* params_full_precision, float scale = 1) override { // Initialize the buffer to zero from the GPU CUDA_CHECK_THROW(cudaMemset(params_full_precision, 0, n_params()*sizeof(float))); } @@ -96,10 +96,10 @@ public: } T* gradient_weights() const { - return m_params_gradient_weight.data(); + return m_param_gradients_weight.data(); } - tcnn::json hyperparams() const override { + json hyperparams() const override { return { {"otype", "TrainableBuffer"}, }; @@ -107,7 +107,7 @@ public: private: uint32_t m_resolution[RANK]; - tcnn::GPUMemory m_params_gradient_weight; + GPUMemory m_param_gradients_weight; }; -NGP_NAMESPACE_END +} diff --git a/include/neural-graphics-primitives/triangle.cuh b/include/neural-graphics-primitives/triangle.cuh index 75ed25793..54fa30690 100644 --- a/include/neural-graphics-primitives/triangle.cuh +++ b/include/neural-graphics-primitives/triangle.cuh @@ -20,11 +20,11 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { struct Triangle { NGP_HOST_DEVICE vec3 sample_uniform_position(const vec2& sample) const { - float sqrt_x = std::sqrt(sample.x); + float sqrt_x = sqrt(sample.x); float factor0 = 1.0f - sqrt_x; float factor1 = sqrt_x * (1.0f - sample.y); float factor2 = sqrt_x * sample.y; @@ -52,7 +52,7 @@ struct Triangle { float v = d * dot(q, v1v0); float t = d * -dot(n, rov0); if (u < 0.0f || u > 1.0f || v < 0.0f || (u+v) > 1.0f || t < 0.0f) { - t = std::numeric_limits::max(); // No intersection + t = std::numeric_limits::max(); } return t; } @@ -74,10 +74,10 @@ struct Triangle { (sign(dot(cross(v21, nor), p1)) + sign(dot(cross(v32, nor), p2)) + sign(dot(cross(v13, nor), p3)) < 2.0f) ? 
// 3 edges
- std::min({
- length2(v21 * tcnn::clamp(dot(v21, p1) / length2(v21), 0.0f, 1.0f)-p1),
- length2(v32 * tcnn::clamp(dot(v32, p2) / length2(v32), 0.0f, 1.0f)-p2),
- length2(v13 * tcnn::clamp(dot(v13, p3) / length2(v13), 0.0f, 1.0f)-p3),
+ min(vec3{
+ length2(v21 * clamp(dot(v21, p1) / length2(v21), 0.0f, 1.0f)-p1),
+ length2(v32 * clamp(dot(v32, p2) / length2(v32), 0.0f, 1.0f)-p2),
+ length2(v13 * clamp(dot(v13, p3) / length2(v13), 0.0f, 1.0f)-p3),
 }) :
 // 1 face
@@ -85,7 +85,7 @@
 }
 NGP_HOST_DEVICE float distance(const vec3& pos) const {
- return std::sqrt(distance_sq(pos));
+ return sqrt(distance_sq(pos));
 }
 NGP_HOST_DEVICE bool point_in_triangle(const vec3& p) const {
@@ -116,7 +116,7 @@
 NGP_HOST_DEVICE vec3 closest_point_to_line(const vec3& a, const vec3& b, const vec3& c) const {
 float t = dot(c - a, b - a) / dot(b - a, b - a);
- t = std::max(std::min(t, 1.0f), 0.0f);
+ t = max(min(t, 1.0f), 0.0f);
 return a + t * (b - a);
 }
@@ -135,7 +135,7 @@
 float mag2 = length2(point - c2);
 float mag3 = length2(point - c3);
- float min = std::min({mag1, mag2, mag3});
+ float min = tcnn::min(vec3{mag1, mag2, mag3});
 if (min == mag1) {
 return c1;
@@ -163,13 +163,4 @@
 vec3 a, b, c;
};
-inline std::ostream& operator<<(std::ostream& os, const ngp::Triangle& triangle) {
- os << "[";
- os << "a=[" << triangle.a.x << "," << triangle.a.y << "," << triangle.a.z << "], ";
- os << "b=[" << triangle.b.x << "," << triangle.b.y << "," << triangle.b.z << "], ";
- os << "c=[" << triangle.c.x << "," << triangle.c.y << "," << triangle.c.z << "]";
- os << "]";
- return os;
-}
-
-NGP_NAMESPACE_END
+}
diff --git a/include/neural-graphics-primitives/triangle_bvh.cuh b/include/neural-graphics-primitives/triangle_bvh.cuh
index c859faa4c..814db7bc4 100644
--- a/include/neural-graphics-primitives/triangle_bvh.cuh
+++ b/include/neural-graphics-primitives/triangle_bvh.cuh
@@ -23,7 +23,7 @@
 #include
-NGP_NAMESPACE_BEGIN
+namespace ngp {
 struct TriangleBvhNode {
 BoundingBox bb;
@@ -65,7 +65,7 @@ public:
 virtual void ray_trace_gpu(uint32_t n_elements, vec3* gpu_positions, vec3* gpu_directions, const Triangle* gpu_triangles, cudaStream_t stream) = 0;
 virtual bool touches_triangle(const BoundingBox& bb, const Triangle* __restrict__ triangles) const = 0;
 virtual void build(std::vector& triangles, uint32_t n_primitives_per_leaf) = 0;
- virtual void build_optix(const tcnn::GPUMemory& triangles, cudaStream_t stream) = 0;
+ virtual void build_optix(const GPUMemory& triangles, cudaStream_t stream) = 0;
 static std::unique_ptr make();
@@ -75,8 +75,8 @@ public:
 protected:
 std::vector m_nodes;
- tcnn::GPUMemory m_nodes_gpu;
+ GPUMemory m_nodes_gpu;
 TriangleBvh() {};
};
-NGP_NAMESPACE_END
+}
diff --git a/include/neural-graphics-primitives/triangle_octree.cuh b/include/neural-graphics-primitives/triangle_octree.cuh
index 7180a44da..0fddc4039 100644
--- a/include/neural-graphics-primitives/triangle_octree.cuh
+++ b/include/neural-graphics-primitives/triangle_octree.cuh
@@ -23,8 +23,8 @@ namespace std {
 template<>
- struct less {
- bool operator()(const u16vec4& a, const u16vec4& b) const {
+ struct less {
+ bool operator()(const tcnn::u16vec4& a, const tcnn::u16vec4& b) const {
 for(size_t i = 0; i < 4; ++i) {
 if (a[i] < b[i]) return true;
 if (a[i] > b[i]) return false;
@@ -34,14 +34,14 @@ namespace std {
 };
 template <>
- struct hash {
- size_t operator()(const u16vec4& x) const {
+ struct hash {
+ size_t operator()(const tcnn::u16vec4& x) const {
 return (size_t)x.x *
73856093 + (size_t)x.y * 19349663 + (size_t)x.z * 83492791 + (size_t)x.w * 25165843;
 }
 };
}
-NGP_NAMESPACE_BEGIN
+namespace ngp {
 struct TriangleOctreeNode {
 int children[8];
@@ -180,7 +180,7 @@ public:
 }
 };
- generate_dual_coords(m_dual_nodes[0], 0, {0, 0, 0});
+ generate_dual_coords(m_dual_nodes[0], 0, {(uint16_t)0, (uint16_t)0, (uint16_t)0});
 for (auto& node : m_nodes) {
 for (uint32_t i = 0; i < 8; ++i) {
 auto child_idx = node.children[i];
@@ -349,11 +349,11 @@ private:
 std::vector m_nodes;
 std::vector m_dual_nodes;
- tcnn::GPUMemory m_nodes_gpu;
- tcnn::GPUMemory m_dual_nodes_gpu;
+ GPUMemory m_nodes_gpu;
+ GPUMemory m_dual_nodes_gpu;
 uint32_t m_n_vertices = 0;
 uint32_t m_depth = 0;
};
-NGP_NAMESPACE_END
+}
diff --git a/scripts/run.py b/scripts/run.py
index f60e77e28..263626299 100644
--- a/scripts/run.py
+++ b/scripts/run.py
@@ -204,6 +204,7 @@ def get_scene(scene):
 tqdm_last_update = now
 if args.save_snapshot:
+ os.makedirs(os.path.dirname(args.save_snapshot) or ".", exist_ok=True) # dirname is empty when saving to a bare filename
 testbed.save_snapshot(args.save_snapshot, False)
 if args.test_transforms:
@@ -280,7 +281,7 @@ def get_scene(scene):
 print(args.screenshot_frames)
 for idx in args.screenshot_frames:
 f = ref_transforms["frames"][int(idx)]
- cam_matrix = f["transform_matrix"]
+ cam_matrix = f.get("transform_matrix", f["transform_matrix_start"])
 testbed.set_nerf_camera_matrix(np.matrix(cam_matrix)[:-1,:])
 outname = os.path.join(args.screenshot_dir, os.path.basename(f["file_path"]))
diff --git a/src/camera_path.cu b/src/camera_path.cu
index 14ea30001..02e33a57d 100644
--- a/src/camera_path.cu
+++ b/src/camera_path.cu
@@ -26,7 +26,7 @@
 using namespace nlohmann;
-NGP_NAMESPACE_BEGIN
+namespace ngp {
 CameraKeyframe lerp(const CameraKeyframe& p0, const CameraKeyframe& p1, float t, float t0, float t1) {
 t = (t - t0) / (t1 - t0);
@@ -157,7 +157,7 @@ int CameraPath::imgui(char path_filename_buf[1024], float frame_milliseconds, ma
 if (ImGui::Button("Load")) {
 try {
 load(path_filename_buf, first_xform);
- } catch (std::exception& e) {
+ } catch (const std::exception& e) {
 ImGui::OpenPopup("Camera path load error");
 camera_path_load_error_string = std::string{"Failed to load camera path: "} + e.what();
 }
@@ -259,7 +259,7 @@ int CameraPath::imgui(char path_filename_buf[1024], float frame_milliseconds, ma
 }
 bool debug_project(const mat4& proj, vec3 p, ImVec2& o) {
- vec4 ph(p, 1.0f);
+ vec4 ph{p.x, p.y, p.z, 1.0f};
 vec4 pa = proj * ph;
 if (pa.w <= 0.f) {
 return false;
 }
@@ -323,12 +323,12 @@ bool CameraPath::imgui_viz(ImDrawList* list, mat4 &view2proj, mat4 &world2proj,
 bool changed = false;
 // float flx = focal.x;
 float fly = focal.y;
- mat4 view2proj_guizmo = transpose(mat4(
+ mat4 view2proj_guizmo = transpose(mat4{
 fly * 2.0f / aspect, 0.0f, 0.0f, 0.0f,
 0.0f, -fly * 2.0f, 0.0f, 0.0f,
 0.0f, 0.0f, (zfar + znear) / (zfar - znear), -(2.0f * zfar * znear) / (zfar - znear),
- 0.0f, 0.0f, 1.0f, 0.0f
- ));
+ 0.0f, 0.0f, 1.0f, 0.0f,
+ });
 if (!update_cam_from_path) {
 ImDrawList* list = ImGui::GetForegroundDrawList();
@@ -350,7 +350,7 @@ bool CameraPath::imgui_viz(ImDrawList* list, mat4 &view2proj, mat4 &world2proj,
 int i0 = cur_cam_i; while (i0 > 0 && keyframes[cur_cam_i].same_pos_as(keyframes[i0 - 1])) i0--;
 int i1 = cur_cam_i; while (i1 < keyframes.size() - 1 && keyframes[cur_cam_i].same_pos_as(keyframes[i1 + 1])) i1++;
 for (int i = i0; i <= i1; ++i) {
- keyframes[i].T = matrix[3].xyz;
+ keyframes[i].T = matrix[3].xyz();
 keyframes[i].R = quat(mat3(matrix));
 }
 changed=true;
 }
@@ -375,4 +375,4 @@ bool CameraPath::imgui_viz(ImDrawList* list, mat4
&view2proj, mat4 &world2proj, } #endif //NGP_GUI -NGP_NAMESPACE_END +} diff --git a/src/common_device.cu b/src/common_device.cu deleted file mode 100644 index c430ab617..000000000 --- a/src/common_device.cu +++ /dev/null @@ -1,247 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. - * - * NVIDIA CORPORATION and its licensors retain all intellectual property - * and proprietary rights in and to this software, related documentation - * and any modifications thereto. Any use, reproduction, disclosure or - * distribution of this software and related documentation without an express - * license agreement from NVIDIA CORPORATION is strictly prohibited. - */ - -/** @file common_device.cu - * @author Thomas Müller, NVIDIA - */ - -#include -#include - -// #include - -#include - -using namespace tcnn; - -NGP_NAMESPACE_BEGIN - - -mat3 so3_log(const mat3& m) { - float tr = tcnn::clamp(m[0][0] + m[1][1] + m[2][2], -0.99999f, 1.0f); - float radians = acosf((tr - 1.0f) / 2.0f); - return radians / sqrt((1.0f + tr) * (3.0f - tr)) * (m - transpose(m)); -} - -mat3 so3_exp(const mat3& m) { - vec3 axis = {-m[2][1], m[2][0], -m[1][0]}; - float radians_sq = length2(axis); - if (radians_sq == 0.0f) { - return mat3(1.0f); - } - - float radians = sqrt(radians_sq); - return mat3(1.0f) + (sin(radians) / radians) * m + ((1.0f - cos(radians)) / radians_sq) * (m * m); -} - -mat4x3 se3_log(const mat4x3& m) { - mat3 omega = so3_log(mat3(m)); - vec3 axis = {-omega[2][1], omega[2][0], -omega[1][0]}; - float radians_sq = length2(axis); - mat3 inv_trans = mat3(1.0f); - if (radians_sq > 0.0f) { - float radians = sqrt(radians_sq); - inv_trans += -0.5f * omega + ((1.0f - 0.5f * radians * cos(0.5f * radians) / sin(0.5f * radians)) / radians_sq) * (omega * omega); - } - - return mat4x3(omega[0], omega[1], omega[2], inv_trans * m[3]); -} - -mat4x3 se3_exp(const mat4x3& m) { - mat3 omega{m}; - vec3 axis = {-omega[2][1], omega[2][0], -omega[1][0]}; - float radians_sq = length2(axis); - mat3 trans = mat3(1.0f); - if (radians_sq > 0.0f) { - float radians = sqrt(radians_sq); - trans += ((1.0f - cos(radians)) / radians_sq) * omega + ((radians - sin(radians)) / (radians * radians_sq)) * (omega * omega); - } - - mat3 rot = so3_exp(omega); - return mat4x3(rot[0], rot[1], rot[2], trans * m[3]); -} - -mat4 se3_log(const mat4& m) { - mat4 result = mat4(se3_log(mat4x3(m))); - result[3][3] = 0.0f; - return result; -} - -mat4 se3_exp(const mat4& m) { - return mat4(se3_exp(mat4x3(m))); -} - -float frobenius_norm(const mat4& m) { - return sqrt(length2(m[0]) + length2(m[1]) + length2(m[2]) + length2(m[3])); -} - -mat4 mat_sqrt(const mat4& m, float eps = 1e-10f) { - mat4 X = m, Y = mat4(1.0f); - for (uint32_t i = 0; i < 32; ++i) { - if (frobenius_norm(X * X - m) < eps) { - return X; - } - - mat4 iX = inverse(X); - X = 0.5f * (X + inverse(Y)); - Y = 0.5f * (Y + iX); - } - - return X; -} - -mat4 mat_log_taylor(const mat4& m, uint32_t n_iters) { - mat4 result = mat4(0.0f); - mat4 cur = m - mat4(1.0f); - float sign = 1.0f; - for (uint32_t i = 1; i < n_iters; ++i) { - result += (sign / (float)i) * cur; - cur *= (m - mat4(1.0f)); - sign = -sign; - } - return result; -} - -mat4 mat_log_hawkins(const mat4& m, float eps = 1e-10f) { - mat4 A = m - mat4(1.0f), Z = A, X = A; - for (uint32_t i = 2; i < 32; ++i) { - if (frobenius_norm(Z) < eps) { - return X; - } - - Z = Z * A; - X += (1.0f / (float)i) * Z; - } - - return X; -} - -mat4 mat_exp_power(const mat4& m, uint32_t n_iters) { - mat4 result = mat4(1.0f); - mat4 cur = m; - float div 
= 1.0f; - for (uint32_t i = 1; i < n_iters; ++i) { - div *= (float)i; - result += (1.0f / div) * cur; - cur *= m; - } - return result; -} - -mat4 mat_exp_pade(const mat4& m) { - // Pade approximation with scaling; same as Matlab. - // Pseudocode translated from Hawkins and Grimm [2007] - mat4 X = mat4(1.0f), D = mat4(1.0f), N = mat4(1.0f); - float c = 1.0f; - constexpr uint32_t q = 6; // Matlab's default when using this algorithm - - float s = -1.0f; - for (uint32_t k = 1; k <= q; ++k) { - c = c * (q - k + 1) / (k * (2 * q - k + 1)); - X = m * X; - auto cX = c * X; - N = N + cX; - D = D + s * cX; - s = -s; - } - - return inverse(D) * N; -} - -mat4 mat_log(const mat4& m) { - mat4 result(m); - - uint32_t j = 0; - for (; j < 32; ++j) { - if (frobenius_norm(result - mat4(1.0f)) < 1e-5f) { - break; - } - - result = mat_sqrt(result); - } - - result = mat_log_hawkins(result); - return scalbnf(1.0f, j) * result; -} - -mat4 mat_exp(const mat4& m) { - uint32_t N_SQUARING = max(0, 1 + (int)floor(log2(frobenius_norm(m)))); - - mat4 result = scalbnf(1.0f, -N_SQUARING) * m; - result = mat_exp_pade(result); - - for (uint32_t i = 0; i < N_SQUARING; ++i) { - result *= result; - } - - return result; -} - -mat3 orthogonalize(const mat3& m) { - return mat3{ - 0.5f * (3.0f - dot(m[0], m[0])) * m[0], - 0.5f * (3.0f - dot(m[1], m[1])) * m[1], - 0.5f * (3.0f - dot(m[2], m[2])) * m[2], - }; -} - -mat4x3 camera_log_lerp(const mat4x3& a, const mat4x3& b, float t) { - return mat_exp(mat_log(mat4(b) * inverse(mat4(a))) * t) * mat4(a); -} - -mat4x3 camera_slerp(const mat4x3& a, const mat4x3& b, float t) { - mat3 rot = slerp(a, b, t); - return {rot[0], rot[1], rot[2], mix(a[3], b[3], t)}; -} - -GPUMemory load_exr_gpu(const fs::path& path, int* width, int* height) { - float* out; // width * height * RGBA - load_exr(&out, width, height, path.str().c_str()); - ScopeGuard mem_guard{[&]() { free(out); }}; - - GPUMemory result((*width) * (*height) * 4); - result.copy_from_host(out); - return result; -} - -GPUMemory load_stbi_gpu(const fs::path& path, int* width, int* height) { - bool is_hdr = is_hdr_stbi(path); - - void* data; // width * height * RGBA - int comp; - if (is_hdr) { - data = load_stbi_float(path, width, height, &comp, 4); - } else { - data = load_stbi(path, width, height, &comp, 4); - } - - if (!data) { - throw std::runtime_error{std::string{stbi_failure_reason()}}; - } - - ScopeGuard mem_guard{[&]() { stbi_image_free(data); }}; - - if (*width == 0 || *height == 0) { - throw std::runtime_error{"Image has zero pixels."}; - } - - GPUMemory result((*width) * (*height) * 4); - if (is_hdr) { - result.copy_from_host((float*)data); - } else { - GPUMemory bytes((*width) * (*height) * 4); - bytes.copy_from_host((uint8_t*)data); - linear_kernel(from_rgba32, 0, nullptr, (*width) * (*height), bytes.data(), result.data(), false, false, 0); - } - - return result; -} - -NGP_NAMESPACE_END diff --git a/src/common.cu b/src/common_host.cu similarity index 76% rename from src/common.cu rename to src/common_host.cu index b6c2ec6ab..2cacd34e8 100644 --- a/src/common.cu +++ b/src/common_host.cu @@ -8,11 +8,15 @@ * license agreement from NVIDIA CORPORATION is strictly prohibited. 
*/ -/** @file common_device.cu +/** @file common_host.cu * @author Thomas Müller, NVIDIA */ -#include +#include +#include +#include +#include +#include #include @@ -21,7 +25,7 @@ #define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_WRITE_IMPLEMENTATION -#ifdef __NVCC__ +#ifdef __CUDACC__ # ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ # pragma nv_diag_suppress 550 # else @@ -30,7 +34,7 @@ #endif #include #include -#ifdef __NVCC__ +#ifdef __CUDACC__ # ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ # pragma nv_diag_default 550 # else @@ -39,7 +43,7 @@ #endif #ifdef _WIN32 -# include +# include #else # include # include @@ -50,9 +54,7 @@ #undef near #undef far -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { bool is_wsl() { #ifdef _WIN32 @@ -95,7 +97,7 @@ std::wstring native_string(const fs::path& path) { return path.wstr(); } std::string native_string(const fs::path& path) { return path.str(); } #endif -fs::path get_executable_dir() { +fs::path discover_executable_dir() { #ifdef _WIN32 WCHAR path[1024]; if (GetModuleFileNameW(NULL, path, 1024) == 0) { @@ -112,8 +114,8 @@ fs::path get_executable_dir() { #endif } -fs::path get_root_dir() { - auto executable_dir = get_executable_dir(); +fs::path discover_root_dir() { + auto executable_dir = discover_executable_dir(); fs::path exists_in_root_dir = "scripts"; for (const auto& candidate : { fs::path{"."}/exists_in_root_dir, @@ -248,4 +250,64 @@ FILE* native_fopen(const fs::path& path, const char* mode) { #endif } -NGP_NAMESPACE_END +GPUMemory load_exr_gpu(const fs::path& path, int* width, int* height) { + float* out; // width * height * RGBA + load_exr(&out, width, height, path.str().c_str()); + ScopeGuard mem_guard{[&]() { free(out); }}; + + GPUMemory result((*width) * (*height) * 4); + result.copy_from_host(out); + return result; +} + +GPUMemory load_stbi_gpu(const fs::path& path, int* width, int* height) { + bool is_hdr = is_hdr_stbi(path); + + void* data; // width * height * RGBA + int comp; + if (is_hdr) { + data = load_stbi_float(path, width, height, &comp, 4); + } else { + data = load_stbi(path, width, height, &comp, 4); + } + + if (!data) { + throw std::runtime_error{std::string{stbi_failure_reason()}}; + } + + ScopeGuard mem_guard{[&]() { stbi_image_free(data); }}; + + if (*width == 0 || *height == 0) { + throw std::runtime_error{"Image has zero pixels."}; + } + + GPUMemory result((*width) * (*height) * 4); + if (is_hdr) { + result.copy_from_host((float*)data); + } else { + GPUMemory bytes((*width) * (*height) * 4); + bytes.copy_from_host((uint8_t*)data); + linear_kernel(from_rgba32, 0, nullptr, (*width) * (*height), bytes.data(), result.data(), false, false, 0); + } + + return result; +} + +std::ostream& operator<<(std::ostream& os, const BoundingBox& bb) { + os << "["; + os << "min=[" << bb.min.x << "," << bb.min.y << "," << bb.min.z << "], "; + os << "max=[" << bb.max.x << "," << bb.max.y << "," << bb.max.z << "]"; + os << "]"; + return os; +} + +std::ostream& operator<<(std::ostream& os, const Triangle& triangle) { + os << "["; + os << "a=[" << triangle.a.x << "," << triangle.a.y << "," << triangle.a.z << "], "; + os << "b=[" << triangle.b.x << "," << triangle.b.y << "," << triangle.b.z << "], "; + os << "c=[" << triangle.c.x << "," << triangle.c.y << "," << triangle.c.z << "]"; + os << "]"; + return os; +} + +} diff --git a/src/dlss.cu b/src/dlss.cu index e498cce4f..d4c55428c 100644 --- a/src/dlss.cu +++ b/src/dlss.cu @@ -12,10 +12,10 @@ * @author Thomas Müller, NVIDIA */ -#include +#include #include -#include +#include #include @@ -36,7 +36,7 
@@ static_assert(false, "DLSS can only be compiled when both Vulkan and GUI support // NGX's macro `NVSDK_NGX_FAILED` results in a change of sign, which does not affect correctness. // Thus, suppress the corresponding warning. -#ifdef __NVCC__ +#ifdef __CUDACC__ # ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ # pragma nv_diag_suppress = integer_sign_change # else @@ -51,9 +51,7 @@ static_assert(false, "DLSS can only be compiled when both Vulkan and GUI support #include #include -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { extern std::atomic g_total_n_bytes_allocated; @@ -314,7 +312,7 @@ public: }; cudaDeviceProp cuda_device_prop; - CUDA_CHECK_THROW(cudaGetDeviceProperties(&cuda_device_prop, tcnn::cuda_device())); + CUDA_CHECK_THROW(cudaGetDeviceProperties(&cuda_device_prop, cuda_device())); auto is_same_as_cuda_device = [&](VkPhysicalDevice device) { VkPhysicalDeviceIDProperties physical_device_id_properties = {}; @@ -1222,4 +1220,4 @@ std::unique_ptr VulkanAndNgx::init_dlss(const ivec2& out_resolution) { return std::make_unique(shared_from_this(), out_resolution); } -NGP_NAMESPACE_END +} diff --git a/src/main.cu b/src/main.cu index ac79bd362..3494f94ff 100644 --- a/src/main.cu +++ b/src/main.cu @@ -23,9 +23,8 @@ using namespace args; using namespace ngp; using namespace std; -using namespace tcnn; -NGP_NAMESPACE_BEGIN +namespace ngp { int main_func(const std::vector& arguments) { ArgumentParser parser{ @@ -191,7 +190,7 @@ int main_func(const std::vector& arguments) { return 0; } -NGP_NAMESPACE_END +} #ifdef _WIN32 int wmain(int argc, wchar_t* argv[]) { diff --git a/src/marching_cubes.cu b/src/marching_cubes.cu index daff134fd..67c934089 100644 --- a/src/marching_cubes.cu +++ b/src/marching_cubes.cu @@ -14,7 +14,7 @@ #include #include -#include +#include #include // helpers to generate random values, directions #include @@ -35,13 +35,11 @@ #include -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { ivec3 get_marching_cubes_res(uint32_t res_1d, const BoundingBox &aabb) { - float scale = res_1d / compMax(aabb.max - aabb.min); - ivec3 res3d = (aabb.max - aabb.min) * scale + vec3(0.5f); + float scale = res_1d / max(aabb.max - aabb.min); + ivec3 res3d = (aabb.max - aabb.min) * scale + 0.5f; res3d.x = next_multiple((unsigned int)res3d.x, 16u); res3d.y = next_multiple((unsigned int)res3d.y, 16u); res3d.z = next_multiple((unsigned int)res3d.z, 16u); @@ -265,7 +263,7 @@ __global__ void gen_vertices(BoundingBox render_aabb, mat3 render_aabb_to_local, uint32_t y = blockIdx.y * blockDim.y + threadIdx.y; uint32_t z = blockIdx.z * blockDim.z + threadIdx.z; if (x>=res_3d.x || y>=res_3d.y || z>=res_3d.z) return; - vec3 scale = (render_aabb.max - render_aabb.min) / vec3(res_3d - ivec3(1)); + vec3 scale = (render_aabb.max - render_aabb.min) / vec3(res_3d - 1); vec3 offset=render_aabb.min; uint32_t res2=res_3d.x*res_3d.y; uint32_t res3=res_3d.x*res_3d.y*res_3d.z; @@ -352,7 +350,7 @@ __global__ void compute_centroids(uint32_t num_verts, vec3* centroids_out, const if (i>=num_verts) return; vec4 p = verts_in[i]; if (p.w<=0.f) return; - vec3 c = verts_in[i].xyz * (1.f / p.w); + vec3 c = verts_in[i].xyz() * (1.f / p.w); centroids_out[i]=c; } @@ -699,7 +697,7 @@ __global__ void gen_faces(ivec3 res_3d, const float* __restrict__ density, const } } -void compute_mesh_1ring(const tcnn::GPUMemory &verts, const tcnn::GPUMemory &indices, tcnn::GPUMemory &output_pos, tcnn::GPUMemory &output_normals) { // computes the average of the 1ring of all verts, as homogenous coordinates +void 
compute_mesh_1ring(const GPUMemory &verts, const GPUMemory &indices, GPUMemory &output_pos, GPUMemory &output_normals) { // computes the average of the 1-ring of all verts, as homogeneous coordinates
 output_pos.resize(verts.size());
 output_pos.memset(0);
 output_normals.resize(verts.size());
@@ -731,7 +729,7 @@ __global__ void compute_mesh_opt_gradients_kernel(
 p.w = 1.f;
 }
- vec3 target = p.xyz * (1.0f / p.w);
+ vec3 target = p.xyz() * (1.0f / p.w);
 vec3 smoothing_grad = src - target; // negative...
 vec3 input_gradient = *(const vec3 *)(input_gradients + i * input_gradient_width);
@@ -743,9 +741,9 @@ void compute_mesh_opt_gradients(
 float thresh,
- const tcnn::GPUMemory& verts,
- const tcnn::GPUMemory& normals,
- const tcnn::GPUMemory& verts_smoothed,
+ const GPUMemory& verts,
+ const GPUMemory& normals,
+ const GPUMemory& verts_smoothed,
 const network_precision_t* densities,
 uint32_t input_gradients_width,
 const float* input_gradients,
@@ -773,7 +771,7 @@ void compute_mesh_opt_gradients(
 );
 }
-void marching_cubes_gpu(cudaStream_t stream, BoundingBox render_aabb, mat3 render_aabb_to_local, ivec3 res_3d, float thresh, const tcnn::GPUMemory& density, tcnn::GPUMemory& verts_out, tcnn::GPUMemory& indices_out) {
+void marching_cubes_gpu(cudaStream_t stream, BoundingBox render_aabb, mat3 render_aabb_to_local, ivec3 res_3d, float thresh, const GPUMemory& density, GPUMemory& verts_out, GPUMemory& indices_out) {
 GPUMemory counters;
 counters.enlarge(4);
@@ -794,10 +792,11 @@ void marching_cubes_gpu(cudaStream_t stream, BoundingBox render_aabb, mat3 rende
 counters.copy_to_host(cpucounters);
 tlog::info() << "#vertices=" << cpucounters[0] << " #triangles=" << (cpucounters[1]/3);
- uint32_t n_verts=(cpucounters[0]+127)&~127; // round for later nn stuff
+ uint32_t n_verts = next_multiple(cpucounters[0], BATCH_SIZE_GRANULARITY); // round up to the batch size granularity of the neural-network queries below
 verts_out.resize(n_verts);
 verts_out.memset(0);
 indices_out.resize(cpucounters[1]);
+ // actually generate verts
 gen_vertices<<>>(render_aabb, render_aabb_to_local, res_3d, density.data(), vertex_grid, verts_out.data(), thresh, counters.data()+2);
 gen_faces<<>>(res_3d, density.data(), vertex_grid, indices_out.data(), thresh, counters.data()+2);
@@ -825,7 +824,7 @@ void save_mesh(
 // Replace invalid values with reasonable defaults
 for (size_t i = 0; i < cpuverts.size(); ++i) {
 if (!all(isfinite(cpuverts[i]))) cpuverts[i] = vec3(0.0f);
- if (!all(isfinite(cpunormals[i]))) cpunormals[i] = vec3(0.0f, 1.0f, 0.0f);
+ if (!all(isfinite(cpunormals[i]))) cpunormals[i] = vec3{0.0f, 1.0f, 0.0f};
 if (!all(isfinite(cpucolors[i]))) cpucolors[i] = vec3(0.0f);
 }
@@ -895,7 +894,7 @@ void save_mesh(
 vec3 p = (cpuverts[i]-nerf_offset)/nerf_scale;
 vec3 c = cpucolors[i];
 vec3 n = normalize(cpunormals[i]);
- unsigned char c8[3] = {(unsigned char)tcnn::clamp(c.x*255.f,0.f,255.f),(unsigned char)tcnn::clamp(c.y*255.f,0.f,255.f),(unsigned char)tcnn::clamp(c.z*255.f,0.f,255.f)};
+ unsigned char c8[3] = {(unsigned char)clamp(c.x*255.f,0.f,255.f),(unsigned char)clamp(c.y*255.f,0.f,255.f),(unsigned char)clamp(c.z*255.f,0.f,255.f)};
 fprintf(f, "%0.5f %0.5f %0.5f %0.3f %0.3f %0.3f %d %d %d\n", p.x, p.y, p.z, n.x, n.y, n.z, c8[0], c8[1], c8[2]);
 }
@@ -911,7 +910,7 @@ void save_mesh(
 for (size_t i = 0; i < cpuverts.size(); ++i) {
 vec3 p = (cpuverts[i]-nerf_offset)/nerf_scale;
 vec3 c = cpucolors[i];
- fprintf(f, "v %0.5f %0.5f %0.5f %0.3f %0.3f %0.3f\n", p.x, p.y, p.z, tcnn::clamp(c.x, 0.f, 1.f), tcnn::clamp(c.y, 0.f, 1.f),
tcnn::clamp(c.z, 0.f, 1.f)); + fprintf(f, "v %0.5f %0.5f %0.5f %0.3f %0.3f %0.3f\n", p.x, p.y, p.z, clamp(c.x, 0.f, 1.f), clamp(c.y, 0.f, 1.f), clamp(c.z, 0.f, 1.f)); } for (auto &v: cpunormals) { @@ -1012,9 +1011,9 @@ void save_density_grid_to_png(const GPUMemory& density, const fs::path& p int z = (u / res3d.x) + (v / res3d.y) * nacross; if (z < res3d.z) { if (swap_y_z) { - *dst++ = (uint8_t)tcnn::clamp((density_cpu[x + z*res3d.x + y*res3d.x*res3d.z]-thresh)*density_scale + 128.5f, 0.f, 255.f); + *dst++ = (uint8_t)clamp((density_cpu[x + z*res3d.x + y*res3d.x*res3d.z]-thresh)*density_scale + 128.5f, 0.f, 255.f); } else { - *dst++ = (uint8_t)tcnn::clamp((density_cpu[x + (res3d.y-1-y)*res3d.x + z*res3d.x*res3d.y]-thresh)*density_scale + 128.5f, 0.f, 255.f); + *dst++ = (uint8_t)clamp((density_cpu[x + (res3d.y-1-y)*res3d.x + z*res3d.x*res3d.y]-thresh)*density_scale + 128.5f, 0.f, 255.f); } } else { *dst++ = 0; @@ -1057,10 +1056,10 @@ void save_rgba_grid_to_png_sequence(const GPUMemory& rgba, const fs::path& for (int y = 0; y < h; ++y) { for (int x = 0; x < w; ++x) { size_t i = swap_y_z ? (x + z*res3d.x + y*res3d.x*res3d.z) : (x + (res3d.y-1-y)*res3d.x + z*res3d.x*res3d.y); - *dst++ = (uint8_t)tcnn::clamp(rgba_cpu[i].x * 255.f, 0.f, 255.f); - *dst++ = (uint8_t)tcnn::clamp(rgba_cpu[i].y * 255.f, 0.f, 255.f); - *dst++ = (uint8_t)tcnn::clamp(rgba_cpu[i].z * 255.f, 0.f, 255.f); - *dst++ = (uint8_t)tcnn::clamp(rgba_cpu[i].w * 255.f, 0.f, 255.f); + *dst++ = (uint8_t)clamp(rgba_cpu[i].x * 255.f, 0.f, 255.f); + *dst++ = (uint8_t)clamp(rgba_cpu[i].y * 255.f, 0.f, 255.f); + *dst++ = (uint8_t)clamp(rgba_cpu[i].z * 255.f, 0.f, 255.f); + *dst++ = (uint8_t)clamp(rgba_cpu[i].w * 255.f, 0.f, 255.f); } } @@ -1109,4 +1108,4 @@ void save_rgba_grid_to_raw_file(const GPUMemory& rgba, const fs::path& pat tlog::success() << "Wrote RGBA raw file to " << actual_path.str(); } -NGP_NAMESPACE_END +} diff --git a/src/nerf_loader.cu b/src/nerf_loader.cu index a61da2876..47fe4f738 100644 --- a/src/nerf_loader.cu +++ b/src/nerf_loader.cu @@ -31,14 +31,12 @@ #include #include #include -#include #include #include -using namespace tcnn; using namespace std::literals; -NGP_NAMESPACE_BEGIN +namespace ngp { __global__ void convert_rgba32(const uint64_t num_pixels, const uint8_t* __restrict__ pixels, uint8_t* __restrict__ out, bool white_2_transparent = false, bool black_2_transparent = false, uint32_t mask_color = 0) { const uint64_t i = threadIdx.x + blockIdx.x * blockDim.x; @@ -168,8 +166,8 @@ NerfDataset create_empty_nerf_dataset(size_t n_images, int aabb_scale, bool is_h result.is_hdr = is_hdr; result.paths = std::vector(n_images, ""); for (size_t i = 0; i < n_images; ++i) { - result.xforms[i].start = mat4x3(1.0f); - result.xforms[i].end = mat4x3(1.0f); + result.xforms[i].start = mat4x3::identity(); + result.xforms[i].end = mat4x3::identity(); } return result; } @@ -353,6 +351,9 @@ NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amou for (auto&& frame : frames) { // Compatibility with Windows paths on Linux. (Breaks linux filenames with "\\" in them, which is acceptable for us.) 
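 // e.g. a hypothetical "images\\0001.png" becomes "images/0001.png"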
frame["file_path"] = replace_all(frame["file_path"], "\\", "/"); + if (frame.contains("depth_path")) { + frame["depth_path"] = replace_all(frame["depth_path"], "\\", "/"); + } } if (json.contains("n_frames")) { @@ -527,7 +528,7 @@ NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amou result.up[2] = float(json["up"][0]); } - if (json.contains("envmap") && !any(equal(result.envmap_resolution, ivec2(0)))) { + if (json.contains("envmap") && product(result.envmap_resolution) > 0) { fs::path envmap_path = resolve_path(base_path, json["envmap"]); if (!envmap_path.exists()) { throw std::runtime_error{fmt::format("Environment map {} does not exist.", envmap_path.str())}; @@ -587,7 +588,7 @@ NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amou } tlog::success() << "Alpha loaded from " << alphapath; - for (int i = 0; i < compMul(dst.res); ++i) { + for (int i = 0; i < product(dst.res); ++i) { img[i*4+3] = (uint8_t)(255.0f*srgb_to_linear(alpha_img[i*4]*(1.f/255.f))); // copy red channel of alpha to alpha.png to our alpha channel } } @@ -606,7 +607,7 @@ NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amou } dst.mask_color = 0x00FF00FF; // HOT PINK - for (int i = 0; i < compMul(dst.res); ++i) { + for (int i = 0; i < product(dst.res); ++i) { if (mask_img[i*4] != 0 || mask_img[i*4+1] != 0 || mask_img[i*4+2] != 0) { *(uint32_t*)&img[i*4] = dst.mask_color; } @@ -638,7 +639,7 @@ NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amou fs::path rayspath = path.parent_path() / fmt::format("rays_{}.dat", path.basename()); if (enable_ray_loading && rayspath.exists()) { - uint32_t n_pixels = compMul(dst.res); + uint32_t n_pixels = product(dst.res); dst.rays = (Ray*)malloc(n_pixels * sizeof(Ray)); std::ifstream rays_file{native_string(rayspath), std::ios::binary}; @@ -664,11 +665,11 @@ NerfDataset load_nerf(const std::vector& jsonpaths, float sharpen_amou nlohmann::json& jsonmatrix_end = frame.contains("transform_matrix_end") ? 
frame["transform_matrix_end"] : jsonmatrix_start; if (frame.contains("driver_parameters")) { - vec3 light_dir( + vec3 light_dir{ frame["driver_parameters"].value("LightX", 0.f), frame["driver_parameters"].value("LightY", 0.f), frame["driver_parameters"].value("LightZ", 0.f) - ); + }; result.metadata[i_img].light_dir = result.nerf_direction_to_ngp(normalize(light_dir)); result.has_light_dirs = true; result.n_extra_learnable_dims = 0; @@ -746,7 +747,7 @@ void NerfDataset::set_training_image(int frame_idx, const ivec2& image_resolutio throw std::runtime_error{"NerfDataset::set_training_image: invalid frame index"}; } - size_t n_pixels = compMul(image_resolution); + size_t n_pixels = product(image_resolution); size_t img_size = n_pixels * 4; // 4 channels size_t image_type_stride = image_type_size(image_type); // copy to gpu if we need to do a conversion @@ -800,7 +801,7 @@ void NerfDataset::set_training_image(int frame_idx, const ivec2& image_resolutio // apply requested sharpening if (sharpen_amount > 0.f) { if (image_type == EImageDataType::Byte) { - tcnn::GPUMemory images_data_half(img_size * sizeof(__half)); + GPUMemory images_data_half(img_size * sizeof(__half)); linear_kernel(from_rgba32<__half>, 0, nullptr, n_pixels, (uint8_t*)pixels, (__half*)images_data_half.data(), white_transparent, black_transparent, mask_color); pixelmemory[frame_idx] = std::move(images_data_half); dst = pixelmemory[frame_idx].data(); @@ -809,7 +810,7 @@ void NerfDataset::set_training_image(int frame_idx, const ivec2& image_resolutio assert(image_type == EImageDataType::Half || image_type == EImageDataType::Float); - tcnn::GPUMemory images_data_sharpened(img_size * image_type_size(image_type)); + GPUMemory images_data_sharpened(img_size * image_type_size(image_type)); float center_w = 4.f + 1.f / sharpen_amount; // center_w ranges from 5 (strong sharpening) to infinite (no sharpening) if (image_type == EImageDataType::Half) { @@ -862,4 +863,4 @@ void NerfDataset::update_metadata(int first, int last) { CUDA_CHECK_THROW(cudaMemcpy(metadata_gpu.data() + first, metadata.data() + first, n * sizeof(TrainingImageMetadata), cudaMemcpyHostToDevice)); } -NGP_NAMESPACE_END +} diff --git a/src/openxr_hmd.cu b/src/openxr_hmd.cu index 08353b772..a348e317d 100644 --- a/src/openxr_hmd.cu +++ b/src/openxr_hmd.cu @@ -15,8 +15,6 @@ * view, hand, and eye poses, as well as controller inputs. 
*/ -#define NOMINMAX - #include #include #include @@ -38,9 +36,7 @@ #pragma GCC diagnostic ignored "-Wmissing-field-initializers" //TODO: XR struct are uninitiaized apart from their type #endif -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { // function XrEnumStr turns enum into string for printing // uses expansion macro and data provided in openxr_reflection.h @@ -1254,7 +1250,7 @@ void OpenXRHMD::end_frame(FrameInfoPtr frame_info, float znear, float zfar, bool XR_CHECK_THROW(xrEndFrame(m_session, &frame_end_info)); } -NGP_NAMESPACE_END +} #ifdef __GNUC__ #pragma GCC diagnostic pop diff --git a/src/optix/pathescape.cu b/src/optix/pathescape.cu index 8e5b8a2c5..13b711d66 100644 --- a/src/optix/pathescape.cu +++ b/src/optix/pathescape.cu @@ -15,13 +15,12 @@ #include #include + #include #include "pathescape.h" -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { extern "C" { __constant__ PathEscape::Params params; @@ -121,4 +120,4 @@ extern "C" __global__ void __closesthit__ch() { optixSetPayload_0(optixGetPrimitiveIndex()); } -NGP_NAMESPACE_END +} diff --git a/src/optix/pathescape.h b/src/optix/pathescape.h index d75a08352..cbdf9e09b 100644 --- a/src/optix/pathescape.h +++ b/src/optix/pathescape.h @@ -20,7 +20,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { struct PathEscape { struct Params { @@ -35,4 +35,4 @@ struct PathEscape { struct HitGroupData {}; }; -NGP_NAMESPACE_END +} diff --git a/src/optix/program.h b/src/optix/program.h index 240c9497b..42302882a 100644 --- a/src/optix/program.h +++ b/src/optix/program.h @@ -14,7 +14,7 @@ #pragma once -NGP_NAMESPACE_BEGIN +namespace ngp { #define OPTIX_CHECK_THROW(x) \ do { \ @@ -34,7 +34,6 @@ NGP_NAMESPACE_BEGIN } \ } while(0) - namespace optix { template struct SbtRecord { @@ -236,4 +235,4 @@ namespace optix { }; } -NGP_NAMESPACE_END +} diff --git a/src/optix/raystab.cu b/src/optix/raystab.cu index 593e02c14..412718a52 100644 --- a/src/optix/raystab.cu +++ b/src/optix/raystab.cu @@ -20,9 +20,7 @@ #include "raystab.h" -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { extern "C" { __constant__ Raystab::Params params; @@ -78,4 +76,4 @@ extern "C" __global__ void __closesthit__ch() { optixSetPayload_0(1); } -NGP_NAMESPACE_END +} diff --git a/src/optix/raystab.h b/src/optix/raystab.h index db8fe99b9..0f11e930b 100644 --- a/src/optix/raystab.h +++ b/src/optix/raystab.h @@ -19,7 +19,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { struct Raystab { struct Params { @@ -33,4 +33,4 @@ struct Raystab { struct HitGroupData {}; }; -NGP_NAMESPACE_END +} diff --git a/src/optix/raytrace.cu b/src/optix/raytrace.cu index ea438c302..562748236 100644 --- a/src/optix/raytrace.cu +++ b/src/optix/raytrace.cu @@ -14,11 +14,12 @@ */ #include + #include #include "raytrace.h" -NGP_NAMESPACE_BEGIN +namespace ngp { extern "C" { __constant__ Raytrace::Params params; @@ -70,4 +71,4 @@ extern "C" __global__ void __closesthit__ch() { optixSetPayload_1(__float_as_int(optixGetRayTmax())); } -NGP_NAMESPACE_END +} diff --git a/src/optix/raytrace.h b/src/optix/raytrace.h index e7b406349..eaf43bd3f 100644 --- a/src/optix/raytrace.h +++ b/src/optix/raytrace.h @@ -20,7 +20,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { struct Raytrace { struct Params { @@ -35,4 +35,4 @@ struct Raytrace { struct HitGroupData {}; }; -NGP_NAMESPACE_END +} diff --git a/src/python_api.cu b/src/python_api.cu index 0df6c001b..851d7c9c4 100644 --- a/src/python_api.cu +++ b/src/python_api.cu @@ -22,7 +22,7 @@ #include #include #include -#include +#include 
#include #include @@ -37,14 +37,10 @@ # include #endif -using namespace tcnn; using namespace nlohmann; namespace py = pybind11; -using namespace pybind11::literals; // to bring in the `_a` literal - -NGP_NAMESPACE_BEGIN - +namespace ngp { void Testbed::Nerf::Training::set_image(int frame_idx, pybind11::array_t img, pybind11::array_t depth_img, float depth_scale) { if (frame_idx < 0 || frame_idx >= dataset.n_images) { @@ -63,7 +59,7 @@ void Testbed::Nerf::Training::set_image(int frame_idx, pybind11::array_t py::buffer_info depth_buf = depth_img.request(); - dataset.set_training_image(frame_idx, {img_buf.shape[1], img_buf.shape[0]}, (const void*)img_buf.ptr, (const float*)depth_buf.ptr, depth_scale, false, EImageDataType::Float, EDepthDataType::Float); + dataset.set_training_image(frame_idx, {(int)img_buf.shape[1], (int)img_buf.shape[0]}, (const void*)img_buf.ptr, (const float*)depth_buf.ptr, depth_scale, false, EImageDataType::Float, EDepthDataType::Float); } void Testbed::override_sdf_training_data(py::array_t points, py::array_t distances) { @@ -99,7 +95,7 @@ void Testbed::override_sdf_training_data(py::array_t points, py::array_t< } pybind11::dict Testbed::compute_marching_cubes_mesh(ivec3 res3d, BoundingBox aabb, float thresh) { - mat3 render_aabb_to_local = mat3(1.0f); + mat3 render_aabb_to_local = mat3::identity(); if (aabb.is_empty()) { aabb = m_testbed_mode == ETestbedMode::Nerf ? m_render_aabb : m_aabb; render_aabb_to_local = m_render_aabb_to_local; @@ -121,7 +117,7 @@ pybind11::dict Testbed::compute_marching_cubes_mesh(ivec3 res3d, BoundingBox aab ns[i] = normalize(ns[i]); } - return py::dict("V"_a=cpuverts, "N"_a=cpunormals, "C"_a=cpucolors, "F"_a=cpuindices); + return py::dict(py::arg("V")=cpuverts, py::arg("N")=cpunormals, py::arg("C")=cpucolors, py::arg("F")=cpuindices); } py::array_t Testbed::render_to_cpu(int width, int height, int spp, bool linear, float start_time, float end_time, float fps, float shutter_fraction) { @@ -237,7 +233,7 @@ py::array_t Testbed::view(bool linear, size_t view_idx) const { #ifdef NGP_GUI py::array_t Testbed::screenshot(bool linear, bool front_buffer) const { - std::vector tmp(compMul(m_window_res) * 4); + std::vector tmp(product(m_window_res) * 4); glReadBuffer(front_buffer ? 
GL_FRONT : GL_BACK); glReadPixels(0, 0, m_window_res.x, m_window_res.y, GL_RGBA, GL_FLOAT, tmp.data()); @@ -266,7 +262,7 @@ py::array_t Testbed::screenshot(bool linear, bool front_buffer) const { PYBIND11_MODULE(pyngp, m) { m.doc() = "Instant neural graphics primitives"; - m.def("free_temporary_memory", &tcnn::free_all_gpu_memory_arenas); + m.def("free_temporary_memory", &free_all_gpu_memory_arenas); py::enum_(m, "TestbedMode") .value("Nerf", ETestbedMode::Nerf) @@ -514,7 +510,6 @@ PYBIND11_MODULE(pyngp, m) { .def_readwrite("screen_center", &Testbed::m_screen_center) .def_readwrite("training_batch_size", &Testbed::m_training_batch_size) .def("set_nerf_camera_matrix", &Testbed::set_nerf_camera_matrix) - .def("add_training_views_to_camera_path", &Testbed::add_training_views_to_camera_path) .def("set_camera_to_training_view", &Testbed::set_camera_to_training_view) .def("first_training_view", &Testbed::first_training_view) .def("last_training_view", &Testbed::last_training_view) @@ -566,7 +561,7 @@ PYBIND11_MODULE(pyngp, m) { .def("crop_box_corners", &Testbed::crop_box_corners, py::arg("nerf_space") = true) .def_property("root_dir", [](py::object& obj) { return obj.cast().root_dir().str(); }, - [](const py::object& obj, const std::string& value) { obj.cast().m_root_dir = value; } + [](const py::object& obj, const std::string& value) { obj.cast().set_root_dir(value); } ) ; @@ -579,7 +574,6 @@ PYBIND11_MODULE(pyngp, m) { }) ; - py::class_ nerf(testbed, "Nerf"); nerf .def_readonly("training", &Testbed::Nerf::training) @@ -597,6 +591,12 @@ PYBIND11_MODULE(pyngp, m) { .def_readwrite("visualize_cameras", &Testbed::Nerf::visualize_cameras) .def_readwrite("glow_y_cutoff", &Testbed::Nerf::glow_y_cutoff) .def_readwrite("glow_mode", &Testbed::Nerf::glow_mode) + .def_readwrite("render_gbuffer_hard_edges", &Testbed::Nerf::render_gbuffer_hard_edges) + .def_readwrite("rendering_extra_dims_from_training_view", &Testbed::Nerf::rendering_extra_dims_from_training_view, "If non-negative, indicates the training view from which the extra dims are used. 
If -1, uses the values previously set by `set_rendering_extra_dims`.") + .def("find_closest_training_view", &Testbed::Nerf::find_closest_training_view, "Obtain the training view that is closest to the current camera.") + .def("set_rendering_extra_dims_from_training_view", &Testbed::Nerf::set_rendering_extra_dims_from_training_view, "Set the extra dims that are used for rendering to those that were trained for a given training view.") + .def("set_rendering_extra_dims", &Testbed::Nerf::set_rendering_extra_dims, "Set the extra dims that are used for rendering.") + .def("get_rendering_extra_dims", &Testbed::Nerf::get_rendering_extra_dims_cpu, "Get the extra dims that are currently used for rendering.") ; py::class_ brdfparams(m, "BRDFParams"); @@ -649,6 +649,7 @@ PYBIND11_MODULE(pyngp, m) { .def_readwrite("depth_loss_type", &Testbed::Nerf::Training::depth_loss_type) .def_readwrite("snap_to_pixel_centers", &Testbed::Nerf::Training::snap_to_pixel_centers) .def_readwrite("optimize_extrinsics", &Testbed::Nerf::Training::optimize_extrinsics) + .def_readwrite("optimize_per_image_latents", &Testbed::Nerf::Training::optimize_extra_dims) .def_readwrite("optimize_extra_dims", &Testbed::Nerf::Training::optimize_extra_dims) .def_readwrite("optimize_exposure", &Testbed::Nerf::Training::optimize_exposure) .def_readwrite("optimize_distortion", &Testbed::Nerf::Training::optimize_distortion) @@ -667,6 +668,7 @@ PYBIND11_MODULE(pyngp, m) { .def_readwrite("exposure_l2_reg", &Testbed::Nerf::Training::exposure_l2_reg) .def_readwrite("depth_supervision_lambda", &Testbed::Nerf::Training::depth_supervision_lambda) .def_readonly("dataset", &Testbed::Nerf::Training::dataset) + .def("get_extra_dims", &Testbed::Nerf::Training::get_extra_dims_cpu, "Get the extra dims (including trained latent code) for a specified training view.") .def("set_camera_intrinsics", &Testbed::Nerf::Training::set_camera_intrinsics, py::arg("frame_idx"), py::arg("fx")=0.f, py::arg("fy")=0.f, @@ -728,4 +730,4 @@ PYBIND11_MODULE(pyngp, m) { ; } -NGP_NAMESPACE_END +} diff --git a/src/render_buffer.cu b/src/render_buffer.cu index 433bb6d52..0ba1fad1e 100644 --- a/src/render_buffer.cu +++ b/src/render_buffer.cu @@ -33,9 +33,7 @@ #include -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { extern std::atomic g_total_n_bytes_allocated; @@ -46,7 +44,7 @@ void CudaSurface2D::free() { m_surface = 0; if (m_array) { cudaFreeArray(m_array); - g_total_n_bytes_allocated -= compMul(m_size) * sizeof(float) * m_n_channels; + g_total_n_bytes_allocated -= product(m_size) * sizeof(float) * m_n_channels; } m_array = nullptr; m_size = ivec2(0); @@ -70,7 +68,7 @@ void CudaSurface2D::resize(const ivec2& size, int n_channels) { } CUDA_CHECK_THROW(cudaMallocArray(&m_array, &desc, size.x, size.y, cudaArraySurfaceLoadStore)); - g_total_n_bytes_allocated += compMul(m_size) * sizeof(float) * n_channels; + g_total_n_bytes_allocated += product(m_size) * sizeof(float) * n_channels; struct cudaResourceDesc resource_desc; memset(&resource_desc, 0, sizeof(resource_desc)); @@ -198,7 +196,7 @@ GLTexture::CUDAMapping::CUDAMapping(GLuint texture_id, const ivec2& size, int n_ // falling back to a regular cuda surface + CPU copy of data m_cuda_surface = std::make_unique(); m_cuda_surface->resize(size, n_channels); - m_data_cpu.resize(compMul(m_size) * n_channels); + m_data_cpu.resize(product(m_size) * n_channels); return; } @@ -253,10 +251,10 @@ __global__ void accumulate_kernel(ivec2 resolution, vec4* frame_buffer, vec4* ac break; } case EColorSpace::SRGB: - color.rgb = 
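The extra-dims bindings added above expose per-training-view latent codes that get appended to the network input. As a hedged sketch of the layout they imply (one code of `n_extra_dims` floats per training image; the helper name and the flat-array layout are assumptions for illustration, not the actual implementation):

	// Hypothetical accessor: latent code of one training view in a flat array.
	const float* extra_dims_for_view(const float* latents, uint32_t view, uint32_t n_extra_dims) {
		return latents + (size_t)view * n_extra_dims; // codes assumed stored contiguously per view
	}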
linear_to_srgb(color.rgb); + color.rgb() = linear_to_srgb(color.rgb()); // fallthrough is intended! case EColorSpace::Linear: - tmp.rgb = (tmp.rgb * sample_count + color.rgb) / (sample_count+1); break; + tmp.rgb() = (tmp.rgb() * sample_count + color.rgb()) / (sample_count+1); break; } tmp.a = (tmp.a * sample_count + color.a) / (sample_count+1); @@ -307,7 +305,7 @@ __device__ vec3 tonemap(vec3 x, ETonemapCurve curve) { k3 = 4.0f * k3; k4 = 2.0f * k4; } else { //if (curve == ETonemapCurve::Reinhard) - const vec3 luminance_coefficients = vec3(0.2126f, 0.7152f, 0.0722f); + const vec3 luminance_coefficients = {0.2126f, 0.7152f, 0.0722f}; float Y = dot(luminance_coefficients, x); return x * (1.f / (Y + 1.0f)); @@ -392,20 +390,20 @@ __global__ void overlay_image_kernel( // The background color is represented in SRGB, so convert // to linear if that's not the space in which we're rendering. if (color_space != EColorSpace::SRGB) { - background_color.xyz = srgb_to_linear(background_color.xyz); + background_color.xyz() = srgb_to_linear(background_color.xyz()); } else { if (color.a > 0) { - color.rgb = linear_to_srgb(color.rgb() / color.a) * color.a; + color.rgb() = linear_to_srgb(color.rgb() / color.a) * color.a; } else { - color.rgb = vec3(0.0f); + color.rgb() = vec3(0.0f); } } float weight = (1 - color.a) * background_color.a; - color.rgb += background_color.rgb * weight; + color.rgb() += background_color.rgb() * weight; color.a += weight; - color.rgb = tonemap(color.rgb, exposure, tonemap_curve, color_space, output_color_space); + color.rgb() = tonemap(color.rgb(), exposure, tonemap_curve, color_space, output_color_space); vec4 prev_color; surf2Dread((float4*)&prev_color, surface, x * sizeof(float4), y); @@ -414,20 +412,20 @@ __global__ void overlay_image_kernel( } __device__ vec3 colormap_turbo(float x) { - const vec4 kRedVec4 = vec4(0.13572138f, 4.61539260f, -42.66032258f, 132.13108234f); - const vec4 kGreenVec4 = vec4(0.09140261f, 2.19418839f, 4.84296658f, -14.18503333f); - const vec4 kBlueVec4 = vec4(0.10667330f, 12.64194608f, -60.58204836f, 110.36276771f); - const vec2 kRedVec2 = vec2(-152.94239396f, 59.28637943f); - const vec2 kGreenVec2 = vec2(4.27729857f, 2.82956604f); - const vec2 kBlueVec2 = vec2(-89.90310912f, 27.34824973f); + const vec4 kRedVec4 = {0.13572138f, 4.61539260f, -42.66032258f, 132.13108234f}; + const vec4 kGreenVec4 = {0.09140261f, 2.19418839f, 4.84296658f, -14.18503333f}; + const vec4 kBlueVec4 = {0.10667330f, 12.64194608f, -60.58204836f, 110.36276771f}; + const vec2 kRedVec2 = {-152.94239396f, 59.28637943f}; + const vec2 kGreenVec2 = {4.27729857f, 2.82956604f}; + const vec2 kBlueVec2 = {-89.90310912f, 27.34824973f}; x = __saturatef(x); - vec4 v4 = vec4{ 1.0f, x, x * x, x * x * x }; - vec2 v2 = vec2{ v4.w * x, v4.w * v4.z }; - return vec3{ + vec4 v4 = { 1.0f, x, x * x, x * x * x }; + vec2 v2 = { v4.w * x, v4.w * v4.z }; + return { dot(v4, kRedVec4) + dot(v2, kRedVec2), dot(v4, kGreenVec4) + dot(v2, kGreenVec2), - dot(v4, kBlueVec4) + dot(v2, kBlueVec2) + dot(v4, kBlueVec4) + dot(v2, kBlueVec2), }; } @@ -504,8 +502,8 @@ __global__ void overlay_false_color_kernel(ivec2 resolution, ivec2 training_reso float scale = training_resolution[fov_axis] / float(resolution[fov_axis]); float u = (x+0.5f-resolution.x*0.5f) * scale + training_resolution.x*0.5f; float v = (y+0.5f-resolution.y*0.5f) * scale + training_resolution.y*0.5f; - int srcx = floorf(u * error_map_resolution.x / float(max(1.f, (float)training_resolution.x))); - int srcy = floorf(v * error_map_resolution.y / 
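The recurring `color.rgb = ...` to `color.rgb() = ...` edits in these kernels stem from the vector-math swap: GLM exposes swizzles as data members, whereas tcnn-style vectors expose them as member functions returning a writable view. A toy sketch of the mechanism (illustrative only; a real implementation avoids the aliasing shortcut taken here):

	struct vec4_sketch {
		float x, y, z, w;
		// View of the first three components; writable because a reference is returned.
		vec3& rgb() { return *reinterpret_cast<vec3*>(this); }
	};
	// Usage then mirrors the kernels above:
	// color.rgb() = linear_to_srgb(color.rgb());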
float(max(1.f, (float)training_resolution.y))); + int srcx = floor(u * error_map_resolution.x / float(max(1.f, (float)training_resolution.x))); + int srcy = floor(v * error_map_resolution.y / float(max(1.f, (float)training_resolution.y))); uint32_t srcidx = srcx + error_map_resolution.x * srcy; @@ -541,18 +539,18 @@ __global__ void tonemap_kernel(ivec2 resolution, float exposure, vec4 background // The background color is represented in SRGB, so convert // to linear if that's not the space in which we're rendering. if (color_space != EColorSpace::SRGB) { - background_color.rgb = srgb_to_linear(background_color.rgb); + background_color.rgb() = srgb_to_linear(background_color.rgb()); } vec4 color = accumulate_buffer[idx]; float weight = (1 - color.a) * background_color.a; - color.rgb += background_color.rgb * weight; + color.rgb() += background_color.rgb() * weight; color.a += weight; - color.rgb = tonemap(color.rgb, vec3(exposure), tonemap_curve, color_space, output_color_space); + color.rgb() = tonemap(color.rgb(), vec3(exposure), tonemap_curve, color_space, output_color_space); if (unmultiply_alpha && color.a > 0.0f) { - color.rgb = color.rgb() / color.a; + color.rgb() = color.rgb() / color.a; } if (clamp_output_color) { @@ -603,7 +601,7 @@ __global__ void depth_splat_kernel( } void CudaRenderBufferView::clear(cudaStream_t stream) const { - size_t n_pixels = compMul(resolution); + size_t n_pixels = product(resolution); CUDA_CHECK_THROW(cudaMemsetAsync(frame_buffer, 0, n_pixels * sizeof(vec4), stream)); CUDA_CHECK_THROW(cudaMemsetAsync(depth_buffer, 0, n_pixels * sizeof(float), stream)); } @@ -791,4 +789,4 @@ void CudaRenderBuffer::disable_dlss() { m_dlss = nullptr; } -NGP_NAMESPACE_END +} diff --git a/src/testbed.cu b/src/testbed.cu index ce10a0385..78b000cc6 100644 --- a/src/testbed.cu +++ b/src/testbed.cu @@ -12,8 +12,8 @@ * @author Thomas Müller & Alex Evans, NVIDIA */ -#include #include +#include #include #include #include @@ -28,8 +28,8 @@ #include #include -#include #include +#include #include #include @@ -45,18 +45,18 @@ #include #ifdef NGP_GUI -# include -# include -# include -# include -# ifdef _WIN32 -# include -# else -# include -# endif -# include -# include -# include +# include +# include +# include +# include +# ifdef _WIN32 +# include +# else +# include +# endif +# include +# include +# include #endif @@ -68,9 +68,8 @@ using namespace std::literals::chrono_literals; -using namespace tcnn; -NGP_NAMESPACE_BEGIN +namespace ngp { int do_system(const std::string& cmd) { #ifdef _WIN32 @@ -358,7 +357,7 @@ void Testbed::load_file(const fs::path& path) { // want to immediately start training on that data. So: go for it. 
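`compMul` becoming `product` (and, further down, `compMax` becoming `max` and `compAdd` becoming `sum`) is a straight rename from GLM's component reductions to tcnn's. What such a reduction looks like, assuming a generic N-component `tvec` as used elsewhere in this patch (signature illustrative):

	template <typename T, uint32_t N>
	NGP_HOST_DEVICE T product(const tvec<T, N>& v) {
		T result = v[0];
		for (uint32_t i = 1; i < N; ++i) {
			result *= v[i]; // fold all components, e.g. pixel count of an ivec2 resolution
		}
		return result;
	}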
m_train = true; } - } catch (std::runtime_error& e) { + } catch (const std::runtime_error& e) { tlog::error() << "Failed to load training data: " << e.what(); } } @@ -443,19 +442,6 @@ void Testbed::next_training_view() { set_camera_to_training_view(m_nerf.training.view); } -void Testbed::add_training_views_to_camera_path() { - for (int i = 0; i < m_nerf.training.dataset.n_images; ++i) { - int n = std::max(0, int(m_camera_path.keyframes.size()) - 1); - auto camera = get_xform_given_rolling_shutter(m_nerf.training.transforms[i], m_nerf.training.dataset.metadata[i].rolling_shutter, vec2{0.5f, 0.5f}, 0.0f); - int j = (int) ceil(m_camera_path.play_time * (float) n + 0.001f); - if (j > m_camera_path.keyframes.size()) j = m_camera_path.keyframes.size(); - if (j < 0) j = 0; - m_camera_path.keyframes.insert(m_camera_path.keyframes.begin() + j, CameraKeyframe(camera, m_slice_plane_z, m_scale, fov(), m_aperture_size, m_nerf.glow_mode, m_nerf.glow_y_cutoff)); - n = std::max(0, int(m_camera_path.keyframes.size()) - 1); - m_camera_path.play_time = n ? float(j) / float(n) : 1.f; - } -} - void Testbed::set_camera_to_training_view(int trainview) { auto old_look_at = look_at(); m_camera = m_smoothed_camera = get_xform_given_rolling_shutter(m_nerf.training.transforms[trainview], m_nerf.training.dataset.metadata[trainview].rolling_shutter, vec2{0.5f, 0.5f}, 0.0f); @@ -487,11 +473,11 @@ void Testbed::reset_camera() { m_scale = 1.5f; } - m_camera = transpose(mat3x4( + m_camera = transpose(mat3x4{ 1.0f, 0.0f, 0.0f, 0.5f, 0.0f, -1.0f, 0.0f, 0.5f, 0.0f, 0.0f, -1.0f, 0.5f - )); + }); m_camera[3] -= m_scale * view_dir(); @@ -509,7 +495,7 @@ void Testbed::set_train(bool mtrain) { } void Testbed::compute_and_save_marching_cubes_mesh(const fs::path& filename, ivec3 res3d , BoundingBox aabb, float thresh, bool unwrap_it) { - mat3 render_aabb_to_local = mat3(1.0f); + mat3 render_aabb_to_local = mat3::identity(); if (aabb.is_empty()) { aabb = m_testbed_mode == ETestbedMode::Nerf ? m_render_aabb : m_aabb; render_aabb_to_local = m_render_aabb_to_local; @@ -519,7 +505,7 @@ void Testbed::compute_and_save_marching_cubes_mesh(const fs::path& filename, ive } ivec3 Testbed::compute_and_save_png_slices(const fs::path& filename, int res, BoundingBox aabb, float thresh, float density_range, bool flip_y_and_z_axes) { - mat3 render_aabb_to_local = mat3(1.0f); + mat3 render_aabb_to_local = mat3::identity(); if (aabb.is_empty()) { aabb = m_testbed_mode == ETestbedMode::Nerf ? 
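Every `mat3(1.0f)` here turns into `mat3::identity()` because the scalar constructor's meaning changes with the vector-math swap: GLM fills the diagonal, while a tcnn-style scalar constructor is assumed here to broadcast to all elements, so the identity matrix needs a named factory. In short:

	mat3 a = mat3::identity(); // diag(1, 1, 1); what GLM's mat3(1.0f) used to mean
	mat3 b = mat3(1.0f);       // assumed broadcast: every element set to 1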
m_render_aabb : m_aabb; render_aabb_to_local = m_render_aabb_to_local; @@ -547,12 +533,16 @@ ivec3 Testbed::compute_and_save_png_slices(const fs::path& filename, int res, Bo fs::path Testbed::root_dir() { if (m_root_dir.empty()) { - m_root_dir = get_root_dir(); + set_root_dir(discover_root_dir()); } return m_root_dir; } +void Testbed::set_root_dir(const fs::path& dir) { + m_root_dir = dir; +} + inline float linear_to_db(float x) { return -10.f*logf(x)/logf(10.f); } @@ -619,8 +609,10 @@ void Testbed::set_crop_box(mat4x3 m, bool nerf_space) { if (nerf_space) { m = m_nerf.training.dataset.nerf_matrix_to_ngp(m, true); } - vec3 radius(length(m[0]), length(m[1]), length(m[2])); + + vec3 radius{length(m[0]), length(m[1]), length(m[2])}; vec3 cen(m[3]); + m_render_aabb_to_local = row(m_render_aabb_to_local, 0, m[0] / radius.x); m_render_aabb_to_local = row(m_render_aabb_to_local, 1, m[1] / radius.y); m_render_aabb_to_local = row(m_render_aabb_to_local, 2, m[2] / radius.z); @@ -633,7 +625,7 @@ std::vector Testbed::crop_box_corners(bool nerf_space) const { mat4x3 m = crop_box(nerf_space); std::vector rv(8); for (int i = 0; i < 8; ++i) { - rv[i] = m * vec4((i & 1) ? 1.f : -1.f, (i & 2) ? 1.f : -1.f, (i & 4) ? 1.f : -1.f, 1.f); + rv[i] = m * vec4{(i & 1) ? 1.f : -1.f, (i & 2) ? 1.f : -1.f, (i & 4) ? 1.f : -1.f, 1.f}; /* debug print out corners to check math is all lined up */ if (0) { tlog::info() << rv[i].x << "," << rv[i].y << "," << rv[i].z << " [" << i << "]"; @@ -682,7 +674,7 @@ void Testbed::imgui() { fov(), m_aperture_size, m_bounding_radius, - !m_nerf.training.dataset.xforms.empty() ? m_nerf.training.dataset.xforms[0].start : mat4x3(1.0f), + !m_nerf.training.dataset.xforms.empty() ? m_nerf.training.dataset.xforms[0].start : mat4x3::identity(), m_nerf.glow_mode, m_nerf.glow_y_cutoff )) { @@ -964,7 +956,7 @@ void Testbed::imgui() { ImGui::DragInt("Seed", (int*)&m_seed, 1.0f, 0, std::numeric_limits::max()); ImGui::PopItemWidth(); - m_training_batch_size = next_multiple(m_training_batch_size, batch_size_granularity); + m_training_batch_size = next_multiple(m_training_batch_size, BATCH_SIZE_GRANULARITY); if (m_train) { std::vector timings; @@ -1161,8 +1153,8 @@ void Testbed::imgui() { set_exposure(m_exposure); } - float max_diam = compMax(m_aabb.max - m_aabb.min); - float render_diam = compMax(m_render_aabb.max - m_render_aabb.min); + float max_diam = max(m_aabb.max - m_aabb.min); + float render_diam = max(m_render_aabb.max - m_render_aabb.min); float old_render_diam = render_diam; if (m_testbed_mode == ETestbedMode::Nerf || m_testbed_mode == ETestbedMode::Volume) { @@ -1218,7 +1210,7 @@ void Testbed::imgui() { ImGui::Separator(); vec3 diag = m_render_aabb.diag(); bool edit_diag = false; - float max_diag = compMax(m_aabb.diag()); + float max_diag = max(m_aabb.diag()); edit_diag |= ImGui::SliderFloat("Size x", ((float*)&diag)+0, 0.001f, max_diag, "%.3f"); edit_diag |= ImGui::SliderFloat("Size y", ((float*)&diag)+1, 0.001f, max_diag, "%.3f"); edit_diag |= ImGui::SliderFloat("Size z", ((float*)&diag)+2, 0.001f, max_diag, "%.3f"); @@ -1231,14 +1223,14 @@ void Testbed::imgui() { if (ImGui::Button("Reset crop box")) { accum_reset = true; m_render_aabb = m_aabb; - m_render_aabb_to_local = mat3(1.0f); + m_render_aabb_to_local = mat3::identity(); } ImGui::SameLine(); if (ImGui::Button("rotation only")) { accum_reset = true; vec3 world_cen = transpose(m_render_aabb_to_local) * m_render_aabb.center(); - m_render_aabb_to_local = mat3(1.0f); + m_render_aabb_to_local = mat3::identity(); vec3 new_cen = 
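`set_crop_box` above factors the crop transform into per-axis extents plus a pure rotation: each basis column's length is the half-extent along that axis, and dividing it out leaves unit vectors for the rotation rows. The decomposition in isolation, using the same names as the hunk:

	vec3 radius{length(m[0]), length(m[1]), length(m[2])}; // per-axis half-extents
	mat3 rot = mat3::identity();
	rot = row(rot, 0, m[0] / radius.x); // normalized axes become rotation rows
	rot = row(rot, 1, m[1] / radius.y);
	rot = row(rot, 2, m[2] / radius.z);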
m_render_aabb_to_local * world_cen; vec3 old_cen = m_render_aabb.center(); m_render_aabb.min += new_cen - old_cen; @@ -1287,14 +1279,15 @@ void Testbed::imgui() { } if (m_nerf.training.dataset.n_extra_learnable_dims) { - accum_reset |= ImGui::SliderInt("training image latent code for inference", (int*)&m_nerf.extra_dim_idx_for_inference, 0, m_nerf.training.dataset.n_images-1); + accum_reset |= ImGui::SliderInt("Rendering extra dims from training view", (int*)&m_nerf.rendering_extra_dims_from_training_view, -1, m_nerf.training.dataset.n_images-1); } + accum_reset |= ImGui::Checkbox("Gbuffer hard edges", &m_nerf.render_gbuffer_hard_edges); + accum_reset |= ImGui::Combo("Groundtruth render mode", (int*)&m_ground_truth_render_mode, GroundTruthRenderModeStr); accum_reset |= ImGui::SliderFloat("Groundtruth alpha", &m_ground_truth_alpha, 0.0f, 1.0f, "%.02f", ImGuiSliderFlags_AlwaysClamp); bool lens_changed = ImGui::Checkbox("Apply lens distortion", &m_nerf.render_with_lens_distortion); - if (m_nerf.render_with_lens_distortion) { lens_changed |= ImGui::Combo("Lens mode", (int*)&m_nerf.render_lens.mode, LensModeStr); if (m_nerf.render_lens.mode == ELensMode::OpenCV) { @@ -1320,10 +1313,10 @@ void Testbed::imgui() { if (lens_changed && !supports_dlss(m_nerf.render_lens.mode)) { m_dlss = false; } - - accum_reset |= lens_changed; } + accum_reset |= lens_changed; + accum_reset |= ImGui::SliderFloat("Min transmittance", &m_nerf.render_min_transmittance, 0.0f, 1.0f, "%.3f", ImGuiSliderFlags_Logarithmic | ImGuiSliderFlags_NoRoundToFormat); ImGui::TreePop(); } @@ -1397,10 +1390,6 @@ void Testbed::imgui() { } if (m_testbed_mode == ETestbedMode::Nerf) { - if (ImGui::Button("Add training views to camera path")) { - add_training_views_to_camera_path(); - } - if (ImGui::Button("First")) { first_training_view(); } @@ -1546,7 +1535,7 @@ void Testbed::imgui() { if (ImGui::Button("Save")) { try { save_snapshot(m_imgui.snapshot_path, m_include_optimizer_state_in_snapshot, m_compress_snapshot); - } catch (std::exception& e) { + } catch (const std::exception& e) { imgui_error_string = fmt::format("Failed to save snapshot: {}", e.what()); ImGui::OpenPopup("Error"); } @@ -1555,7 +1544,7 @@ void Testbed::imgui() { if (ImGui::Button("Load")) { try { load_snapshot(m_imgui.snapshot_path); - } catch (std::exception& e) { + } catch (const std::exception& e) { imgui_error_string = fmt::format("Failed to load snapshot: {}", e.what()); ImGui::OpenPopup("Error"); } @@ -1630,7 +1619,7 @@ void Testbed::imgui() { auto effective_view_dir = flip_y_and_z_axes ? vec3{0.0f, 1.0f, 0.0f} : vec3{0.0f, 0.0f, 1.0f}; auto old_local = m_render_aabb_to_local; auto old_aabb = m_render_aabb; - m_render_aabb_to_local = mat3(1.0f); + m_render_aabb_to_local = mat3::identity(); auto dir = m_data_path.is_directory() || m_data_path.empty() ? 
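The "rotation only" reset just above keeps the crop box's world-space center fixed while discarding its orientation: express the center in world space under the old rotation, re-express it in the new identity frame, and shift the AABB by the difference. Its core, with `R_old` and `aabb` standing in for `m_render_aabb_to_local` and `m_render_aabb`:

	vec3 world_cen = transpose(R_old) * aabb.center(); // local -> world
	mat3 R_new = mat3::identity();
	vec3 new_cen = R_new * world_cen;                  // world -> new local frame
	vec3 old_cen = aabb.center();
	aabb.min += new_cen - old_cen;                     // translate box, center preserved
	aabb.max += new_cen - old_cen;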
(m_data_path / "volume_raw") : (m_data_path.parent_path() / fmt::format("{}_volume_raw", m_data_path.filename())); if (!dir.exists()) { fs::create_directory(dir); @@ -1687,7 +1676,7 @@ void Testbed::imgui() { accum_reset |= ImGui::SliderFloat("Clearcoat", &m_sdf.brdf.clearcoat, 0.f, 1.f); accum_reset |= ImGui::SliderFloat("Clearcoat gloss", &m_sdf.brdf.clearcoat_gloss, 0.f, 1.f); } - m_sdf.brdf.ambientcolor = (m_background_color * m_background_color).rgb; + m_sdf.brdf.ambientcolor = (m_background_color * m_background_color).rgb(); } if (ImGui::CollapsingHeader("Histograms of encoding parameters")) { @@ -1704,11 +1693,11 @@ void Testbed::imgui() { // Hashgrid statistics - for (int i = 0; i < m_n_levels; ++i) { + for (uint32_t i = 0; i < m_n_levels; ++i) { f[i] = m_level_stats[i].mean(); } ImGui::PlotHistogram("Grid means", f.data(), m_n_levels, 0, "means", FLT_MAX, FLT_MAX, ImVec2(0, 60.f)); - for (int i = 0; i < m_n_levels; ++i) { + for (uint32_t i = 0; i < m_n_levels; ++i) { f[i] = m_level_stats[i].sigma(); } ImGui::PlotHistogram("Grid sigmas", f.data(), m_n_levels, 0, "sigma", FLT_MAX, FLT_MAX, ImVec2(0, 60.f)); @@ -1716,7 +1705,7 @@ void Testbed::imgui() { // Histogram of trained hashgrid params - ImGui::SliderInt("Show details for level", &m_histo_level, 0, m_n_levels - 1); + ImGui::SliderInt("Show details for level", (int*)&m_histo_level, 0, m_n_levels - 1); if (m_histo_level < m_n_levels) { LevelStats& s = m_level_stats[m_histo_level]; static bool excludezero = false; @@ -1775,12 +1764,12 @@ void Testbed::draw_visualizations(ImDrawList* list, const mat4x3& camera_matrix) float xyscale = (float)m_window_res[m_fov_axis]; vec2 screen_center = render_screen_center(m_screen_center); - mat4 view2proj = transpose(mat4( + mat4 view2proj = transpose(mat4{ xyscale, 0.0f, (float)m_window_res.x*screen_center.x * zscale, 0.0f, 0.0f, xyscale, (float)m_window_res.y*screen_center.y * zscale, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 0.0f, zscale, 0.0f - )); + 0.0f, 0.0f, zscale, 0.0f, + }); mat4 world2proj = view2proj * world2view; float aspect = (float)m_window_res.x / (float)m_window_res.y; @@ -1793,7 +1782,7 @@ void Testbed::draw_visualizations(ImDrawList* list, const mat4x3& camera_matrix) } if (m_visualize_unit_cube) { - visualize_cube(list, world2proj, vec3(0.f), vec3(1.f), mat3(1.0f)); + visualize_cube(list, world2proj, vec3(0.f), vec3(1.f), mat3::identity()); } if (m_edit_render_aabb) { @@ -1806,17 +1795,17 @@ void Testbed::draw_visualizations(ImDrawList* list, const mat4x3& camera_matrix) float fly = focal.y; float zfar = m_ndc_zfar; float znear = m_ndc_znear; - mat4 view2proj_guizmo = transpose(mat4( + mat4 view2proj_guizmo = transpose(mat4{ fly * 2.0f / aspect, 0.0f, 0.0f, 0.0f, 0.0f, -fly * 2.f, 0.0f, 0.0f, 0.0f, 0.0f, (zfar + znear) / (zfar - znear), -(2.0f * zfar * znear) / (zfar - znear), - 0.0f, 0.0f, 1.0f, 0.0f - )); + 0.0f, 0.0f, 1.0f, 0.0f, + }); ImGuizmo::SetRect(0, 0, io.DisplaySize.x, io.DisplaySize.y); - static mat4 matrix = mat4(1.0f); - static mat4 world2view_guizmo = mat4(1.0f); + static mat4 matrix = mat4::identity(); + static mat4 world2view_guizmo = mat4::identity(); vec3 cen = transpose(m_render_aabb_to_local) * m_render_aabb.center(); if (!ImGuizmo::IsUsing()) { @@ -1834,7 +1823,6 @@ void Testbed::draw_visualizations(ImDrawList* list, const mat4x3& camera_matrix) auto prev_matrix = matrix; if (ImGuizmo::Manipulate((const float*)&world2view_guizmo, (const float*)&view2proj_guizmo, m_camera_path.m_gizmo_op, ImGuizmo::LOCAL, (float*)&matrix, NULL, NULL)) { - auto 
crop_transform = matrix; if (m_edit_world_transform) { // We transform the world by transforming the camera in the opposite direction. auto rel = prev_matrix * inverse(matrix); @@ -1844,7 +1832,7 @@ void Testbed::draw_visualizations(ImDrawList* list, const mat4x3& camera_matrix) m_up_dir = mat3(rel) * m_up_dir; } else { m_render_aabb_to_local = transpose(mat3(matrix)); - vec3 new_cen = m_render_aabb_to_local * matrix[3].xyz; + vec3 new_cen = m_render_aabb_to_local * matrix[3].xyz(); vec3 old_cen = m_render_aabb.center(); m_render_aabb.min += new_cen - old_cen; m_render_aabb.max += new_cen - old_cen; @@ -1921,9 +1909,6 @@ bool Testbed::keyboard_event() { if (ImGui::IsKeyPressed('G')) { m_render_ground_truth = !m_render_ground_truth; reset_accumulation(); - if (m_render_ground_truth) { - m_nerf.training.view = find_best_training_view(m_nerf.training.view); - } } if (ImGui::IsKeyPressed('T')) { @@ -2029,7 +2014,7 @@ bool Testbed::keyboard_event() { translate_vec *= m_camera_velocity * m_frame_ms.val() / 1000.0f; if (shift) { - translate_vec *= 5; + translate_vec *= 5.0f; } if (translate_vec != vec3(0.0f)) { @@ -2056,7 +2041,7 @@ void Testbed::mouse_wheel() { // When in image mode, zoom around the hovered point. if (m_testbed_mode == ETestbedMode::Image) { - ivec2 mouse = {ImGui::GetMousePos().x, ImGui::GetMousePos().y}; + vec2 mouse = {ImGui::GetMousePos().x, ImGui::GetMousePos().y}; vec3 offset = get_3d_pos_from_pixel(*m_views.front().render_buffer, mouse) - look_at(); // Don't center around infinitely distant points. @@ -2076,9 +2061,8 @@ mat3 Testbed::rotation_from_angles(const vec2& angles) const { void Testbed::mouse_drag() { vec2 rel = vec2{ImGui::GetIO().MouseDelta.x, ImGui::GetIO().MouseDelta.y} / (float)m_window_res[m_fov_axis]; - ivec2 mouse = {ImGui::GetMousePos().x, ImGui::GetMousePos().y}; + vec2 mouse = {ImGui::GetMousePos().x, ImGui::GetMousePos().y}; - vec3 up = m_up_dir; vec3 side = m_camera[0]; bool shift = ImGui::GetIO().KeyMods & ImGuiKeyModFlags_Shift; @@ -2122,7 +2106,7 @@ void Testbed::mouse_drag() { // Middle pressed if (ImGui::GetIO().MouseClicked[2]) { - m_drag_depth = get_depth_from_renderbuffer(*m_views.front().render_buffer, vec2(mouse) / vec2(m_window_res)); + m_drag_depth = get_depth_from_renderbuffer(*m_views.front().render_buffer, mouse / vec2(m_window_res)); } // Middle held @@ -2176,11 +2160,12 @@ void Testbed::handle_user_input() { if (m_testbed_mode == ETestbedMode::Nerf && (m_render_ground_truth || m_nerf.training.render_error_overlay)) { // find nearest training view to current camera, and set it - int bestimage = find_best_training_view(-1); - if (bestimage >= 0) { - m_nerf.training.view = bestimage; - if (ImGui::GetIO().MouseReleased[0]) {// snap camera to ground truth view on mouse up - set_camera_to_training_view(m_nerf.training.view); + int bestimage = m_nerf.find_closest_training_view(m_camera); + m_nerf.training.view = bestimage; + if (ImGui::GetIO().MouseReleased[0]) { // snap camera to ground truth view on mouse up + set_camera_to_training_view(m_nerf.training.view); + if (m_nerf.training.dataset.n_extra_dims()) { + m_nerf.set_rendering_extra_dims_from_training_view(m_nerf.training.view); } } } @@ -2216,7 +2201,7 @@ void Testbed::begin_vr_frame_and_handle_vr_input() { if (n_views > 0) { set_n_views(n_views); - ivec2 total_size = ivec2(0); + ivec2 total_size = 0; for (size_t i = 0; i < n_views; ++i) { ivec2 view_resolution = {views[i].view.subImage.imageRect.extent.width, views[i].view.subImage.imageRect.extent.height}; total_size += 
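`find_closest_training_view`, now used in `handle_user_input` in place of the removed `find_best_training_view`, conceptually scans the training transforms for the view nearest the current camera. A hedged sketch; the real scoring may also weigh orientation, and the helper below is illustrative rather than the actual member function:

	int find_closest_view(const mat4x3& camera, const std::vector<mat4x3>& xforms) {
		int best = -1;
		float best_dist = std::numeric_limits<float>::infinity();
		for (int i = 0; i < (int)xforms.size(); ++i) {
			float dist = length(xforms[i][3] - camera[3]); // distance between camera origins
			if (dist < best_dist) { best_dist = dist; best = i; }
		}
		return best;
	}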
view_resolution; @@ -2522,7 +2507,7 @@ void Testbed::draw_gui() { glBlendEquationSeparate(GL_FUNC_ADD, GL_FUNC_ADD); glBlendFuncSeparate(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA); - ivec2 extent = ivec2((float)display_w / m_n_views.x, (float)display_h / m_n_views.y); + ivec2 extent = {(int)((float)display_w / m_n_views.x), (int)((float)display_h / m_n_views.y)}; int i = 0; for (int y = 0; y < m_n_views.y; ++y) { @@ -2547,7 +2532,7 @@ void Testbed::draw_gui() { auto draw_mesh = [&]() { glClear(GL_DEPTH_BUFFER_BIT); - ivec2 res(display_w, display_h); + ivec2 res = {display_w, display_h}; vec2 focal_length = calc_focal_length(res, m_relative_focal_length, m_fov_axis, m_zoom); draw_mesh_gl(m_mesh.verts, m_mesh.vert_normals, m_mesh.vert_colors, m_mesh.indices, res, focal_length, m_smoothed_camera, render_screen_center(m_screen_center), (int)m_mesh_render_mode); }; @@ -2597,11 +2582,11 @@ __global__ void to_8bit_color_kernel( surf2Dread((float4*)&color, surface, x * sizeof(float4), y); if (output_color_space == EColorSpace::Linear) { - color.rgb = linear_to_srgb(color.rgb); + color.rgb() = linear_to_srgb(color.rgb()); } for (uint32_t i = 0; i < 3; ++i) { - result[(x + resolution.x * y) * 3 + i] = (uint8_t)(tcnn::clamp(color[i], 0.0f, 1.0f) * 255.0f + 0.5f); + result[(x + resolution.x * y) * 3 + i] = (uint8_t)(clamp(color[i], 0.0f, 1.0f) * 255.0f + 0.5f); } } @@ -2627,7 +2612,7 @@ void Testbed::prepare_next_camera_path_frame() { const dim3 threads = { 16, 8, 1 }; const dim3 blocks = { div_round_up((uint32_t)res.x, threads.x), div_round_up((uint32_t)res.y, threads.y), 1 }; - GPUMemory image_data(compMul(res) * 3); + GPUMemory image_data(product(res) * 3); to_8bit_color_kernel<<>>( res, EColorSpace::SRGB, // the GUI always renders in SRGB @@ -2757,7 +2742,7 @@ void Testbed::train_and_render(bool skip_rendering) { m_render_ms.update(std::chrono::duration(std::chrono::steady_clock::now()-start).count()); }}; - if (norm(m_smoothed_camera - m_camera) < 0.001f) { + if (frobenius_norm(m_smoothed_camera - m_camera) < 0.001f) { m_smoothed_camera = m_camera; } else if (!m_camera_path.rendering) { reset_accumulation(true); @@ -2773,7 +2758,7 @@ void Testbed::train_and_render(bool skip_rendering) { view.visualized_dimension = m_visualized_dimension; } - m_n_views = {m_views.size(), 1}; + m_n_views = {(int)m_views.size(), 1}; m_nerf.render_with_lens_distortion = false; reset_accumulation(true); @@ -2849,8 +2834,8 @@ void Testbed::train_and_render(bool skip_rendering) { size_t n_pixels = 0, n_pixels_full_res = 0; for (const auto& view : m_views) { - n_pixels += compMul(view.render_buffer->in_resolution()); - n_pixels_full_res += compMul(view.full_resolution); + n_pixels += product(view.render_buffer->in_resolution()); + n_pixels_full_res += product(view.full_resolution); } float pixel_ratio = (n_pixels == 0 || (m_train && m_training_step == 0)) ? 
(1.0f / 256.0f) : ((float)n_pixels / (float)n_pixels_full_res); @@ -2861,7 +2846,7 @@ void Testbed::train_and_render(bool skip_rendering) { factor = 8.f / (float)m_fixed_res_factor; } - factor = tcnn::clamp(factor, 1.0f / 16.0f, 1.0f); + factor = clamp(factor, 1.0f / 16.0f, 1.0f); for (auto&& view : m_views) { if (m_dlss) { @@ -2877,7 +2862,7 @@ void Testbed::train_and_render(bool skip_rendering) { new_render_res = m_camera_path.render_settings.resolution; } - float ratio = std::sqrt((float)compMul(render_res) / (float)compMul(new_render_res)); + float ratio = std::sqrt((float)product(render_res) / (float)product(new_render_res)); if (ratio > 1.2f || ratio < 0.8f || factor == 1.0f || !m_dynamic_res || m_camera_path.rendering) { render_res = new_render_res; } @@ -2902,7 +2887,7 @@ void Testbed::train_and_render(bool skip_rendering) { resolution_scale = clamp(resolution_scale * foveation_begin_factor, vec2(1.0f / m_foveated_rendering_max_scaling), vec2(1.0f)); view.foveation = {resolution_scale, vec2(1.0f) - view.screen_center, vec2(m_foveated_rendering_full_res_diameter * 0.5f)}; - m_foveated_rendering_scaling = 2.0f / compAdd(resolution_scale); + m_foveated_rendering_scaling = 2.0f / sum(resolution_scale); } else { view.foveation = {vec2(1.0f / m_foveated_rendering_scaling), vec2(1.0f) - view.screen_center, vec2(m_foveated_rendering_full_res_diameter * 0.5f)}; } @@ -2954,7 +2939,7 @@ void Testbed::train_and_render(bool skip_rendering) { } if (m_picture_in_picture_res > 0) { - ivec2 res(m_picture_in_picture_res, m_picture_in_picture_res * 9/16); + ivec2 res{(int)m_picture_in_picture_res, (int)(m_picture_in_picture_res * 9.0f / 16.0f)}; m_pip_render_buffer->resize(res); if (m_pip_render_buffer->spp() < 8) { // a bit gross, but let's copy the keyframe's state into the global state in order to not have to plumb through the fov etc to render_frame. @@ -3122,7 +3107,6 @@ void Testbed::init_window(int resw, int resh, bool hidden, bool second_window) { return; } - testbed->redraw_gui_next_frame(); for (int i = 0; i < count; i++) { testbed->load_file(paths[i]); } @@ -3137,7 +3121,11 @@ void Testbed::init_window(int resw, int resh, bool hidden, bool second_window) { glfwSetCursorPosCallback(m_glfw_window, [](GLFWwindow* window, double xpos, double ypos) { Testbed* testbed = (Testbed*)glfwGetWindowUserPointer(window); - if (testbed) { + if ( + testbed && + (ImGui::IsAnyItemActive() || ImGui::GetIO().WantCaptureMouse || ImGuizmo::IsUsing()) && + (ImGui::GetIO().MouseDown[0] || ImGui::GetIO().MouseDown[1] || ImGui::GetIO().MouseDown[2]) + ) { testbed->redraw_gui_next_frame(); } }); @@ -3355,7 +3343,7 @@ bool Testbed::frame() { // Render against the trained neural network. If we're training and already close to convergence, // we can skip rendering if the scene camera doesn't change - uint32_t n_to_skip = m_train ? tcnn::clamp(m_training_step / 16u, 15u, 255u) : 0; + uint32_t n_to_skip = m_train ? 
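The dynamic-resolution logic above applies hysteresis: a newly computed target resolution is only adopted when it differs from the current one by roughly 20% or more on a linear scale, which keeps the render buffers from being reallocated every frame over small fluctuations. The decision in isolation:

	float ratio = std::sqrt((float)product(render_res) / (float)product(new_render_res));
	if (ratio > 1.2f || ratio < 0.8f) {
		render_res = new_render_res; // change is large enough to justify reallocation
	}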
clamp(m_training_step / 16u, 15u, 255u) : 0; if (m_render_skip_due_to_lack_of_camera_movement_counter > n_to_skip) { m_render_skip_due_to_lack_of_camera_movement_counter = 0; } @@ -3379,7 +3367,7 @@ bool Testbed::frame() { } #endif - if (!skip_rendering || (std::chrono::steady_clock::now() - m_last_gui_draw_time_point) > 25ms) { + if (!skip_rendering || std::chrono::steady_clock::now() - m_last_gui_draw_time_point > 50ms) { redraw_gui_next_frame(); } @@ -3387,7 +3375,7 @@ bool Testbed::frame() { while (true) { (*m_task_queue.tryPop())(); } - } catch (SharedQueueEmptyException&) {} + } catch (const SharedQueueEmptyException&) {} train_and_render(skip_rendering); @@ -3633,9 +3621,9 @@ void Testbed::reset_network(bool clear_density_grid) { if (m_testbed_mode == ETestbedMode::Nerf) { m_nerf.training.loss_type = string_to_loss_type(loss_config.value("otype", "L2")); - // Some of the Nerf-supported losses are not supported by tcnn::Loss, + // Some of the Nerf-supported losses are not supported by Loss, // so just create a dummy L2 loss there. The NeRF code path will bypass - // the tcnn::Loss in any case. + // the Loss in any case. loss_config["otype"] = "L2"; } @@ -3664,7 +3652,7 @@ void Testbed::reset_network(bool clear_density_grid) { float desired_resolution = 2048.0f; // Desired resolution of the finest hashgrid level over the unit cube if (m_testbed_mode == ETestbedMode::Image) { - desired_resolution = compMax(m_image.resolution) / 2.0f; + desired_resolution = max(m_image.resolution) / 2.0f; } else if (m_testbed_mode == ETestbedMode::Volume) { desired_resolution = m_volume.world2index_scale; } @@ -3686,8 +3674,8 @@ void Testbed::reset_network(bool clear_density_grid) { ; } - m_loss.reset(create_loss(loss_config)); - m_optimizer.reset(create_optimizer(optimizer_config)); + m_loss.reset(create_loss(loss_config)); + m_optimizer.reset(create_optimizer(optimizer_config)); size_t n_encoding_params = 0; if (m_testbed_mode == ETestbedMode::Nerf) { @@ -3696,7 +3684,7 @@ void Testbed::reset_network(bool clear_density_grid) { m_nerf.training.cam_rot_offset.resize(m_nerf.training.dataset.n_images, RotationAdamOptimizer(1e-4f)); m_nerf.training.cam_focal_length_offset = AdamOptimizer(1e-5f); - m_nerf.training.reset_extra_dims(m_rng); + m_nerf.reset_extra_dims(m_rng); json& dir_encoding_config = config["dir_encoding"]; json& rgb_network_config = config["rgb_network"]; @@ -3706,7 +3694,7 @@ void Testbed::reset_network(bool clear_density_grid) { // Instantiate an additional model for each auxiliary GPU for (auto& device : m_devices) { - device.set_nerf_network(std::make_shared>( + device.set_nerf_network(std::make_shared>( dims.n_pos, n_dir_dims, n_extra_dims, @@ -3741,7 +3729,6 @@ void Testbed::reset_network(bool clear_density_grid) { << "]-->" << 3 ; - // Create distortion map model { json& distortion_map_optimizer_config = config.contains("distortion_map") && config["distortion_map"].contains("optimizer") ? 
config["distortion_map"]["optimizer"] : optimizer_config; @@ -3769,15 +3756,15 @@ void Testbed::reset_network(bool clear_density_grid) { m_sdf.brick_data.free_memory(); } - m_encoding.reset(new TakikawaEncoding( + m_encoding.reset(new TakikawaEncoding( encoding_config["starting_level"], m_sdf.triangle_octree, - tcnn::string_to_interpolation_type(encoding_config.value("interpolation", "linear")) + string_to_interpolation_type(encoding_config.value("interpolation", "linear")) )); m_sdf.uses_takikawa_encoding = true; } else { - m_encoding.reset(create_encoding(dims.n_input, encoding_config)); + m_encoding.reset(create_encoding(dims.n_input, encoding_config)); m_sdf.uses_takikawa_encoding = false; if (m_sdf.octree_depth_target == 0 && encoding_config.contains("n_levels")) { @@ -3786,7 +3773,7 @@ void Testbed::reset_network(bool clear_density_grid) { } for (auto& device : m_devices) { - device.set_network(std::make_shared>(m_encoding, dims.n_output, network_config)); + device.set_network(std::make_shared>(m_encoding, dims.n_output, network_config)); } m_network = primary_device().network(); @@ -3799,14 +3786,15 @@ void Testbed::reset_network(bool clear_density_grid) { << "]-->" << m_encoding->padded_output_width() << "--[" << std::string(network_config["otype"]) << "(neurons=" << (int)network_config["n_neurons"] << ",layers=" << ((int)network_config["n_hidden_layers"]+2) << ")" - << "]-->" << dims.n_output; + << "]-->" << dims.n_output + ; } size_t n_network_params = m_network->n_params() - n_encoding_params; tlog::info() << " total_encoding_params=" << n_encoding_params << " total_network_params=" << n_network_params; - m_trainer = std::make_shared>(m_network, m_optimizer, m_loss, m_seed); + m_trainer = std::make_shared>(m_network, m_optimizer, m_loss, m_seed); m_training_step = 0; m_training_start_time_point = std::chrono::steady_clock::now(); @@ -3831,6 +3819,19 @@ void Testbed::reset_network(bool clear_density_grid) { } Testbed::Testbed(ETestbedMode mode) { + tcnn::set_log_callback([](LogSeverity severity, const std::string& msg) { + tlog::ESeverity s = tlog::ESeverity::Info; + switch (severity) { + case LogSeverity::Info: s = tlog::ESeverity::Info; break; + case LogSeverity::Debug: s = tlog::ESeverity::Debug; break; + case LogSeverity::Warning: s = tlog::ESeverity::Warning; break; + case LogSeverity::Error: s = tlog::ESeverity::Error; break; + case LogSeverity::Success: s = tlog::ESeverity::Success; break; + default: break; + } + tlog::log(s) << msg; + }); + if (!(__CUDACC_VER_MAJOR__ > 10 || (__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2))) { throw std::runtime_error{"Testbed requires CUDA 10.2 or later."}; } @@ -3866,7 +3867,13 @@ Testbed::Testbed(ETestbedMode mode) { int active_device = cuda_device(); int active_compute_capability = cuda_compute_capability(); - tlog::success() << "Initialized CUDA. Active GPU is #" << active_device << ": " << cuda_device_name() << " [" << active_compute_capability << "]"; + tlog::success() << fmt::format( + "Initialized CUDA {}. Active GPU is #{}: {} [{}]", + cuda_runtime_version_string(), + active_device, + cuda_device_name(), + active_compute_capability + ); if (active_compute_capability < MIN_GPU_ARCH) { tlog::warning() << "Insufficient compute capability " << active_compute_capability << " detected."; @@ -3996,7 +4003,7 @@ void Testbed::train(uint32_t batch_size) { reset_accumulation(false, false); } - uint32_t n_prep_to_skip = m_testbed_mode == ETestbedMode::Nerf ? 
tcnn::clamp(m_training_step / 16u, 1u, 16u) : 1u; + uint32_t n_prep_to_skip = m_testbed_mode == ETestbedMode::Nerf ? clamp(m_training_step / 16u, 1u, 16u) : 1u; if (m_training_step % n_prep_to_skip == 0) { auto start = std::chrono::steady_clock::now(); ScopeGuard timing_guard{[&]() { @@ -4053,7 +4060,7 @@ vec2 Testbed::calc_focal_length(const ivec2& resolution, const vec2& relative_fo vec2 Testbed::render_screen_center(const vec2& screen_center) const { // see pixel_to_ray for how screen center is used; 0.5, 0.5 is 'normal'. we flip so that it becomes the point in the original image we want to center on. - return (vec2(0.5f) - screen_center) * m_zoom + vec2(0.5f); + return (0.5f - screen_center) * m_zoom + 0.5f; } __global__ void dlss_prep_kernel( @@ -4090,7 +4097,7 @@ __global__ void dlss_prep_kernel( const float depth = depth_buffer[idx]; vec2 mvec = motion_vector( sample_index, - {x, y}, + {(int)x, (int)y}, resolution, focal_length, camera, @@ -4137,7 +4144,7 @@ __global__ void spherical_checkerboard_kernel( Ray ray = pixel_to_ray( 0, - {x, y}, + {(int)x, (int)y}, resolution, focal_length, camera, @@ -4161,7 +4168,7 @@ __global__ void spherical_checkerboard_kernel( // Blend background color on top of checkerboard first (checkerboard is meant to be "behind" the background, // representing transparency), and then blend the result behind the frame buffer. - background_color.rgb = srgb_to_linear(background_color.rgb); + background_color.rgb() = srgb_to_linear(background_color.rgb()); background_color += (1.0f - background_color.a) * checker; uint32_t idx = x + resolution.x * y; @@ -4196,7 +4203,7 @@ __global__ void vr_overlay_hands_kernel( Ray ray = pixel_to_ray( 0, - {x, y}, + {(int)x, (int)y}, resolution, focal_length, camera, @@ -4255,13 +4262,13 @@ __global__ void vr_overlay_hands_kernel( vec4 prev_color; surf2Dread((float4*)&prev_color, surface, x * sizeof(float4), y); if (output_color_space == EColorSpace::SRGB) { - prev_color.rgb = srgb_to_linear(prev_color.rgb); + prev_color.rgb() = srgb_to_linear(prev_color.rgb()); } color += (1.0f - color.a) * prev_color; if (output_color_space == EColorSpace::SRGB) { - color.rgb = linear_to_srgb(color.rgb); + color.rgb() = linear_to_srgb(color.rgb()); } surf2Dwrite(to_float4(color), surface, x * sizeof(float4), y); @@ -4318,7 +4325,7 @@ void Testbed::render_frame_main( switch (m_testbed_mode) { case ETestbedMode::Nerf: if (!m_render_ground_truth || m_ground_truth_alpha < 1.0f) { - render_nerf(device.stream(), device.render_buffer_view(), *device.nerf_network(), device.data().density_grid_bitfield_ptr, focal_length, camera_matrix0, camera_matrix1, nerf_rolling_shutter, screen_center, foveation, visualized_dimension); + render_nerf(device.stream(), device, device.render_buffer_view(), device.nerf_network(), device.data().density_grid_bitfield_ptr, focal_length, camera_matrix0, camera_matrix1, nerf_rolling_shutter, screen_center, foveation, visualized_dimension); } break; case ETestbedMode::Sdf: @@ -4370,7 +4377,7 @@ void Testbed::render_frame_main( ); } } : (distance_fun_t)[&](uint32_t n_elements, const vec3* positions, float* distances, cudaStream_t stream) { - n_elements = next_multiple(n_elements, tcnn::batch_size_granularity); + n_elements = next_multiple(n_elements, BATCH_SIZE_GRANULARITY); GPUMatrix positions_matrix((float*)positions, 3, n_elements); GPUMatrix distances_matrix(distances, 1, n_elements); m_network->inference(stream, positions_matrix, distances_matrix); @@ -4380,7 +4387,7 @@ void Testbed::render_frame_main( 
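Both throttles above, `n_to_skip` for re-rendering a static camera and `n_prep_to_skip` for training prep, follow the same schedule: run every n-th step where n grows linearly with the training step and is clamped to a fixed range, so the work happens often early in training and rarely near convergence. The shared pattern:

	uint32_t skip_interval(uint32_t step, uint32_t lo, uint32_t hi) {
		return clamp(step / 16u, lo, hi); // e.g. lo = 1, hi = 16 for training prep
	}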
m_render_ground_truth ? (normals_fun_t)[&](uint32_t n_elements, const vec3* positions, vec3* normals, cudaStream_t stream) { // NO-OP. Normals will automatically be populated by raytrace } : (normals_fun_t)[&](uint32_t n_elements, const vec3* positions, vec3* normals, cudaStream_t stream) { - n_elements = next_multiple(n_elements, tcnn::batch_size_granularity); + n_elements = next_multiple(n_elements, BATCH_SIZE_GRANULARITY); GPUMatrix positions_matrix((float*)positions, 3, n_elements); GPUMatrix normals_matrix((float*)normals, 3, n_elements); m_network->input_gradient(stream, 0, positions_matrix, normals_matrix); @@ -4463,7 +4470,7 @@ void Testbed::render_frame_epilogue( EColorSpace output_color_space = to_srgb ? EColorSpace::SRGB : EColorSpace::Linear; if (m_render_transparency_as_checkerboard) { - mat4x3 checkerboard_transform = mat4x3(1.0f); + mat4x3 checkerboard_transform = mat4x3::identity(); #ifdef NGP_GUI if (m_hmd && m_vr_frame_info && !m_vr_frame_info->views.empty()) { @@ -4582,15 +4589,15 @@ float Testbed::get_depth_from_renderbuffer(const CudaRenderBuffer& render_buffer float depth; auto res = render_buffer.in_resolution(); - ivec2 depth_pixel = clamp(ivec2(uv * vec2(res)), ivec2(0), res - ivec2(1)); + ivec2 depth_pixel = clamp(ivec2(uv * vec2(res)), 0, res - 1); CUDA_CHECK_THROW(cudaMemcpy(&depth, render_buffer.depth_buffer() + depth_pixel.x + depth_pixel.y * res.x, sizeof(float), cudaMemcpyDeviceToHost)); return depth; } -vec3 Testbed::get_3d_pos_from_pixel(const CudaRenderBuffer& render_buffer, const ivec2& pixel) { - float depth = get_depth_from_renderbuffer(render_buffer, vec2(pixel) / vec2(m_window_res)); - auto ray = pixel_to_ray_pinhole(0, pixel, m_window_res, calc_focal_length(m_window_res, m_relative_focal_length, m_fov_axis, m_zoom), m_smoothed_camera, render_screen_center(m_screen_center)); +vec3 Testbed::get_3d_pos_from_pixel(const CudaRenderBuffer& render_buffer, const vec2& pixel) { + float depth = get_depth_from_renderbuffer(render_buffer, pixel / vec2(m_window_res)); + auto ray = pixel_to_ray_pinhole(0, ivec2(pixel), m_window_res, calc_focal_length(m_window_res, m_relative_focal_length, m_fov_axis, m_zoom), m_smoothed_camera, render_screen_center(m_screen_center)); return ray(depth); } @@ -4645,7 +4652,7 @@ void Testbed::gather_histograms() { CUDA_CHECK_THROW(cudaStreamSynchronize(m_stream.get())); - for (int l = 0; l < m_n_levels; ++l) { + for (uint32_t l = 0; l < m_n_levels; ++l) { m_level_stats[l] = compute_level_stats(grid.data() + hg_enc->level_params_offset(l), hg_enc->level_n_params(l)); } @@ -4861,8 +4868,32 @@ void Testbed::load_snapshot(const fs::path& path) { set_all_devices_dirty(); } -void Testbed::CudaDevice::set_nerf_network(const std::shared_ptr>& nerf_network) { - m_network = m_nerf_network = nerf_network; +Testbed::CudaDevice::CudaDevice(int id, bool is_primary) : m_id{id}, m_is_primary{is_primary} { + auto guard = device_guard(); + m_stream = std::make_unique(); + m_data = std::make_unique(); + m_render_worker = std::make_unique(is_primary ? 
0u : 1u); +} + +ScopeGuard Testbed::CudaDevice::device_guard() { + int prev_device = cuda_device(); + if (prev_device == m_id) { + return {}; + } + + set_cuda_device(m_id); + return ScopeGuard{[prev_device]() { + set_cuda_device(prev_device); + }}; +} + +void Testbed::CudaDevice::set_network(const std::shared_ptr>& network) { + m_network = network; +} + +void Testbed::CudaDevice::set_nerf_network(const std::shared_ptr>& nerf_network) { + m_nerf_network = nerf_network; + set_network(nerf_network); } void Testbed::sync_device(CudaRenderBuffer& render_buffer, Testbed::CudaDevice& device) { @@ -4904,6 +4935,7 @@ void Testbed::sync_device(CudaRenderBuffer& render_buffer, Testbed::CudaDevice& } device.set_dirty(false); + device.signal(m_stream.get()); } // From https://stackoverflow.com/questions/20843271/passing-a-non-copyable-closure-object-to-stdfunction-parameter @@ -4930,7 +4962,7 @@ ScopeGuard Testbed::use_device(cudaStream_t stream, CudaRenderBuffer& render_buf int active_device = cuda_device(); auto guard = device.device_guard(); - size_t n_pixels = compMul(render_buffer.in_resolution()); + size_t n_pixels = product(render_buffer.in_resolution()); GPUMemoryArena::Allocation alloc; auto scratch = allocate_workspace_and_distribute(device.stream(), &alloc, n_pixels, n_pixels); @@ -4945,8 +4977,8 @@ ScopeGuard Testbed::use_device(cudaStream_t stream, CudaRenderBuffer& render_buf return ScopeGuard{make_copyable_function([&render_buffer, &device, guard=std::move(guard), alloc=std::move(alloc), active_device, stream]() { // Copy device's render buffer's data onto the original render buffer - CUDA_CHECK_THROW(cudaMemcpyPeerAsync(render_buffer.frame_buffer(), active_device, device.render_buffer_view().frame_buffer, device.id(), compMul(render_buffer.in_resolution()) * sizeof(vec4), device.stream())); - CUDA_CHECK_THROW(cudaMemcpyPeerAsync(render_buffer.depth_buffer(), active_device, device.render_buffer_view().depth_buffer, device.id(), compMul(render_buffer.in_resolution()) * sizeof(float), device.stream())); + CUDA_CHECK_THROW(cudaMemcpyPeerAsync(render_buffer.frame_buffer(), active_device, device.render_buffer_view().frame_buffer, device.id(), product(render_buffer.in_resolution()) * sizeof(vec4), device.stream())); + CUDA_CHECK_THROW(cudaMemcpyPeerAsync(render_buffer.depth_buffer(), active_device, device.render_buffer_view().depth_buffer, device.id(), product(render_buffer.in_resolution()) * sizeof(float), device.stream())); device.set_render_buffer_view({}); device.signal(stream); @@ -4960,7 +4992,7 @@ void Testbed::set_all_devices_dirty() { } void Testbed::load_camera_path(const fs::path& path) { - m_camera_path.load(path, mat4x3(1.0f)); + m_camera_path.load(path, mat4x3::identity()); } bool Testbed::loop_animation() { @@ -4971,5 +5003,5 @@ void Testbed::set_loop_animation(bool value) { m_camera_path.loop = value; } -NGP_NAMESPACE_END +} diff --git a/src/testbed_image.cu b/src/testbed_image.cu index bdaee92a3..329ec83f6 100644 --- a/src/testbed_image.cu +++ b/src/testbed_image.cu @@ -26,9 +26,7 @@ #include -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { Testbed::NetworkDims Testbed::network_dims_image() const { NetworkDims dims; @@ -104,7 +102,7 @@ __global__ void init_image_coords( // Hence: generate rays and intersect that plane. 
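The new `device_guard()` leans on `ScopeGuard` to restore the previously active CUDA device whenever the guard goes out of scope, including on exceptions. For reference, a minimal stand-in with the same shape; the real type ships with tiny-cuda-nn and this sketch is purely illustrative:

	#include <functional>
	#include <utility>

	class ScopeGuardSketch {
		std::function<void()> m_callback;
	public:
		ScopeGuardSketch() = default; // empty guard, as returned when no device switch is needed
		explicit ScopeGuardSketch(std::function<void()> callback) : m_callback{std::move(callback)} {}
		ScopeGuardSketch(ScopeGuardSketch&& other) { std::swap(m_callback, other.m_callback); }
		~ScopeGuardSketch() { if (m_callback) m_callback(); } // runs the cleanup at scope exit
	};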
Ray ray = pixel_to_ray( sample_index, - {x, y}, + {(int)x, (int)y}, resolution, focal_length, camera_matrix, @@ -128,11 +126,11 @@ __global__ void init_image_coords( return; } - vec2 uv = ray(t).xy; + vec2 uv = ray(t).xy(); // Flip from world coordinates where Y goes up to image coordinates where Y goes down. // Also, multiply the x-axis by the image's aspect ratio to make it have the right proportions. - uv = (uv - vec2(0.5f)) * vec2(aspect, -1.0f) + vec2(0.5f); + uv = (uv - 0.5f) * vec2{aspect, -1.0f} + 0.5f; depth_buffer[idx] = t; positions[idx] = uv; @@ -173,10 +171,10 @@ __global__ void eval_image_kernel_and_snap(uint32_t n_elements, const T* __restr vec2 pos = positions[i]; auto read_val = [&](int x, int y) { - auto val = ((tcnn::vector_t*)texture)[y * resolution.x + x]; - vec4 result{val[0], val[1], val[2], val[3]}; + auto val = ((tvec*)texture)[y * resolution.x + x]; + vec4 result{(float)val[0], (float)val[1], (float)val[2], (float)val[3]}; if (!linear_colors) { - result.rgb = linear_to_srgb(result.rgb); + result.rgb() = linear_to_srgb(result.rgb()); } return result; }; @@ -184,16 +182,16 @@ __global__ void eval_image_kernel_and_snap(uint32_t n_elements, const T* __restr vec4 val; if (snap_to_pixel_centers) { ivec2 pos_int = floor(pos * vec2(resolution)); - positions[i] = (vec2(pos_int) + vec2(0.5f)) / vec2(resolution); - pos_int = clamp(pos_int, ivec2(0), resolution - ivec2(1)); + positions[i] = (vec2(pos_int) + 0.5f) / vec2(resolution); + pos_int = clamp(pos_int, 0, resolution - 1); val = read_val(pos_int.x, pos_int.y); } else { - pos = clamp(pos * vec2(resolution) - vec2(0.5f), vec2(0.0f), vec2(resolution) - vec2(1.0f + 1e-4f)); + pos = clamp(pos * vec2(resolution) - 0.5f, 0.0f, vec2(resolution) - (1.0f + 1e-4f)); const ivec2 pos_int = pos; const vec2 weight = pos - vec2(pos_int); - const ivec2 idx = clamp(pos_int, ivec2(0), resolution - ivec2(2)); + const ivec2 idx = clamp(pos_int, 0, resolution - 2); val = (1 - weight.x) * (1 - weight.y) * read_val(idx.x, idx.y) + @@ -215,11 +213,8 @@ void Testbed::train_image(size_t target_batch_size, bool get_loss_scalar, cudaSt const uint32_t n_output_dims = 3; const uint32_t n_input_dims = 2; - // Auxiliary matrices for training const uint32_t batch_size = (uint32_t)target_batch_size; - // Permute all training records to de-correlate training data - const uint32_t n_elements = batch_size; m_image.training.positions.enlarge(n_elements); m_image.training.targets.enlarge(n_elements); @@ -271,16 +266,11 @@ void Testbed::train_image(size_t target_batch_size, bool get_loss_scalar, cudaSt GPUMatrix training_batch_matrix((float*)(m_image.training.positions.data()), n_input_dims, batch_size); GPUMatrix training_target_matrix((float*)(m_image.training.targets.data()), n_output_dims, batch_size); - - { - auto ctx = m_trainer->training_step(stream, training_batch_matrix, training_target_matrix, nullptr, false); - if (get_loss_scalar) { - m_loss_scalar.update(m_trainer->loss(stream, *ctx)); - } + auto ctx = m_trainer->training_step(stream, training_batch_matrix, training_target_matrix); + if (get_loss_scalar) { + m_loss_scalar.update(m_trainer->loss(stream, *ctx)); } - - m_trainer->optimizer_step(stream, 128); m_training_step++; } @@ -297,7 +287,7 @@ void Testbed::render_image( // Make sure we have enough memory reserved to render at the requested resolution size_t n_pixels = (size_t)res.x * res.y; - uint32_t n_elements = next_multiple((uint32_t)n_pixels, tcnn::batch_size_granularity); + uint32_t n_elements = next_multiple((uint32_t)n_pixels, 
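The non-snapped branch of `eval_image_kernel_and_snap` below is textbook bilinear filtering: clamp the continuous position into the valid texel range, split it into an integer cell and fractional weights, then blend the four surrounding texels. The same computation in isolation, with `tex(x, y)` standing in for the `read_val` lookup:

	pos = clamp(pos * vec2(resolution) - 0.5f, 0.0f, vec2(resolution) - (1.0f + 1e-4f));
	ivec2 cell = pos;          // integer texel
	vec2 w = pos - vec2(cell); // fractional weights in [0, 1)
	vec4 val =
		(1 - w.x) * (1 - w.y) * tex(cell.x,     cell.y    ) +
		     w.x  * (1 - w.y) * tex(cell.x + 1, cell.y    ) +
		(1 - w.x) *      w.y  * tex(cell.x,     cell.y + 1) +
		     w.x  *      w.y  * tex(cell.x + 1, cell.y + 1);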
BATCH_SIZE_GRANULARITY); m_image.render_coords.enlarge(n_elements); m_image.render_out.enlarge(n_elements); @@ -379,7 +369,7 @@ void Testbed::load_image(const fs::path& data_path) { } m_aabb = m_render_aabb = BoundingBox{vec3(0.0f), vec3(1.0f)}; - m_render_aabb_to_local = mat3(1.0f); + m_render_aabb_to_local = mat3::identity(); tlog::success() << "Loaded a " << (m_image.type == EDataType::Half ? "half" : "full") << "-precision image with " @@ -446,7 +436,7 @@ __global__ void image_coords_from_idx(const uint32_t n_elements, uint32_t offset int x = idx % resolution.x; int y = idx / resolution.x; - pos[i] = (vec2(clamp(ivec2{x, y}, ivec2(0), resolution - ivec2(1))) + vec2(0.5f)) / vec2(resolution); + pos[i] = (vec2(clamp(ivec2{x, y}, 0, resolution - 1)) + 0.5f) / vec2(resolution); } __global__ void image_mse_kernel(const uint32_t n_elements, const vec3* __restrict__ target, const vec3* __restrict__ prediction, float* __restrict__ result, bool quantize_to_byte) { @@ -455,7 +445,7 @@ __global__ void image_mse_kernel(const uint32_t n_elements, const vec3* __restri vec3 pred = prediction[i]; if (quantize_to_byte) { - pred = vec3(clamp(ivec3(pred * 255.0f + vec3(0.5f)), ivec3(0), ivec3(255))) / 255.0f; + pred = vec3(clamp(ivec3(pred * 255.0f + 0.5f), 0, 255)) / 255.0f; } const vec3 diff = target[i] - pred; @@ -467,7 +457,7 @@ float Testbed::compute_image_mse(bool quantize_to_byte) { const uint32_t n_input_dims = 2; // Auxiliary matrices for training - const uint32_t n_elements = compMul(m_image.resolution); + const uint32_t n_elements = product(m_image.resolution); const uint32_t max_batch_size = 1u<<20; GPUMemory se(n_elements); @@ -526,4 +516,4 @@ float Testbed::compute_image_mse(bool quantize_to_byte) { return reduce_sum(se.data(), n_elements, nullptr) / n_elements; } -NGP_NAMESPACE_END +} diff --git a/src/testbed_nerf.cu b/src/testbed_nerf.cu index 2ef067c2a..404512d5e 100644 --- a/src/testbed_nerf.cu +++ b/src/testbed_nerf.cu @@ -41,26 +41,7 @@ #undef copysign #endif -using namespace tcnn; - -NGP_NAMESPACE_BEGIN - -inline constexpr __device__ float NERF_RENDERING_NEAR_DISTANCE() { return 0.05f; } -inline constexpr __device__ uint32_t NERF_STEPS() { return 1024; } // finest number of steps per unit length -inline constexpr __device__ uint32_t NERF_CASCADES() { return 8; } - -inline constexpr __device__ float SQRT3() { return 1.73205080757f; } -inline constexpr __device__ float STEPSIZE() { return (SQRT3() / NERF_STEPS()); } // for nerf raymarch -inline constexpr __device__ float MIN_CONE_STEPSIZE() { return STEPSIZE(); } -// Maximum step size is the width of the coarsest gridsize cell. -inline constexpr __device__ float MAX_CONE_STEPSIZE() { return STEPSIZE() * (1<<(NERF_CASCADES()-1)) * NERF_STEPS() / NERF_GRIDSIZE(); } - -// Used to index into the PRNG stream. Must be larger than the number of -// samples consumed by any given training ray. -inline constexpr __device__ uint32_t N_MAX_RANDOM_SAMPLES_PER_RAY() { return 16; } - -// Any alpha below this is considered "invisible" and is thus culled away. -inline constexpr __device__ float NERF_MIN_OPTICAL_THICKNESS() { return 0.01f; } +namespace ngp { static constexpr uint32_t MARCH_ITER = 10000; @@ -75,336 +56,6 @@ Testbed::NetworkDims Testbed::network_dims_nerf() const { return dims; } -inline __host__ __device__ uint32_t grid_mip_offset(uint32_t mip) { - return NERF_GRID_N_CELLS() * mip; -} - -inline __host__ __device__ float calc_cone_angle(float cosine, const vec2& focal_length, float cone_angle_constant) { - // Pixel size. 
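For intuition about the step-size constants removed in this hunk: with `NERF_STEPS() = 1024` steps per unit length, the finest march step is one 1024th of the unit cube's diagonal,

	constexpr float SQRT3    = 1.73205080757f;
	constexpr float STEPSIZE = SQRT3 / 1024.0f; // ~0.00169, the MIN_CONE_STEPSIZE()

and `MAX_CONE_STEPSIZE()` scales this up to the cell width of the coarsest occupancy-grid cascade, per the comment in the deleted block.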
Doesn't always yield a good performance vs. quality - // trade off. Especially if training pixels have a much different - // size than rendering pixels. - // return cosine*cosine / focal_length.mean(); - - return cone_angle_constant; -} - -inline __host__ __device__ float to_stepping_space(float t, float cone_angle) { - if (cone_angle <= 1e-5f) { - return t / MIN_CONE_STEPSIZE(); - } - - float log1p_c = logf(1.0f + cone_angle); - - float a = (logf(MIN_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c; - float b = (logf(MAX_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c; - - float at = expf(a * log1p_c); - float bt = expf(b * log1p_c); - - if (t <= at) { - return (t - at) / MIN_CONE_STEPSIZE() + a; - } else if (t <= bt) { - return logf(t) / log1p_c; - } else { - return (t - bt) / MAX_CONE_STEPSIZE() + b; - } -} - -inline __host__ __device__ float from_stepping_space(float n, float cone_angle) { - if (cone_angle <= 1e-5f) { - return n * MIN_CONE_STEPSIZE(); - } - - float log1p_c = logf(1.0f + cone_angle); - - float a = (logf(MIN_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c; - float b = (logf(MAX_CONE_STEPSIZE()) - logf(log1p_c)) / log1p_c; - - float at = expf(a * log1p_c); - float bt = expf(b * log1p_c); - - if (n <= a) { - return (n - a) * MIN_CONE_STEPSIZE() + at; - } else if (n <= b) { - return expf(n * log1p_c); - } else { - return (n - b) * MAX_CONE_STEPSIZE() + bt; - } -} - -inline __host__ __device__ float advance_n_steps(float t, float cone_angle, float n) { - return from_stepping_space(to_stepping_space(t, cone_angle) + n, cone_angle); -} - -inline __host__ __device__ float calc_dt(float t, float cone_angle) { - return advance_n_steps(t, cone_angle, 1.0f) - t; -} - -struct LossAndGradient { - vec3 loss; - vec3 gradient; - - __host__ __device__ LossAndGradient operator*(float scalar) { - return {loss * scalar, gradient * scalar}; - } - - __host__ __device__ LossAndGradient operator/(float scalar) { - return {loss / scalar, gradient / scalar}; - } -}; - -inline __device__ vec3 copysign(const vec3& a, const vec3& b) { - return { - copysignf(a.x, b.x), - copysignf(a.y, b.y), - copysignf(a.z, b.z), - }; -} - -inline __device__ LossAndGradient l2_loss(const vec3& target, const vec3& prediction) { - vec3 difference = prediction - target; - return { - difference * difference, - 2.0f * difference - }; -} - -inline __device__ LossAndGradient relative_l2_loss(const vec3& target, const vec3& prediction) { - vec3 difference = prediction - target; - vec3 denom = prediction * prediction + vec3(1e-2f); - return { - difference * difference / denom, - 2.0f * difference / denom - }; -} - -inline __device__ LossAndGradient l1_loss(const vec3& target, const vec3& prediction) { - vec3 difference = prediction - target; - return { - abs(difference), - copysign(vec3(1.0f), difference), - }; -} - -inline __device__ LossAndGradient huber_loss(const vec3& target, const vec3& prediction, float alpha = 1) { - vec3 difference = prediction - target; - vec3 abs_diff = abs(difference); - vec3 square = 0.5f/alpha * difference * difference; - return { - { - abs_diff.x > alpha ? (abs_diff.x - 0.5f * alpha) : square.x, - abs_diff.y > alpha ? (abs_diff.y - 0.5f * alpha) : square.y, - abs_diff.z > alpha ? (abs_diff.z - 0.5f * alpha) : square.z, - }, - { - abs_diff.x > alpha ? (difference.x > 0 ? 1.0f : -1.0f) : (difference.x / alpha), - abs_diff.y > alpha ? (difference.y > 0 ? 1.0f : -1.0f) : (difference.y / alpha), - abs_diff.z > alpha ? (difference.z > 0 ? 
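`to_stepping_space`/`from_stepping_space` above convert a ray distance t to a continuous step index n and back: stepping is linear at the minimum step size near the camera, exponential with ratio (1 + cone_angle) in the middle range, and linear again at the maximum step size far away; the breakpoints a and b are placed so the pieces join smoothly. The two functions are exact inverses of each other, which the marching code depends on:

	float n = to_stepping_space(t, cone_angle);
	float t2 = from_stepping_space(n, cone_angle); // t2 == t up to rounding
	// advance_n_steps(t, cone_angle, 1.0f) composes them to take one step.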
1.0f : -1.0f) : (difference.z / alpha), - }, - }; -} - -inline __device__ LossAndGradient log_l1_loss(const vec3& target, const vec3& prediction) { - vec3 difference = prediction - target; - vec3 divisor = abs(difference) + vec3(1.0f); - return { - log(divisor), - copysign(vec3(1.0f) / divisor, difference), - }; -} - -inline __device__ LossAndGradient smape_loss(const vec3& target, const vec3& prediction) { - vec3 difference = prediction - target; - vec3 denom = 0.5f * (abs(prediction) + abs(target)) + vec3(1e-2f); - return { - abs(difference) / denom, - copysign(vec3(1.0f) / denom, difference), - }; -} - -inline __device__ LossAndGradient mape_loss(const vec3& target, const vec3& prediction) { - vec3 difference = prediction - target; - vec3 denom = abs(prediction) + vec3(1e-2f); - return { - abs(difference) / denom, - copysign(vec3(1.0f) / denom, difference), - }; -} - -inline __device__ float distance_to_next_voxel(const vec3& pos, const vec3& dir, const vec3& idir, float res) { // dda like step - vec3 p = res * (pos - vec3(0.5f)); - float tx = (floorf(p.x + 0.5f + 0.5f * sign(dir.x)) - p.x) * idir.x; - float ty = (floorf(p.y + 0.5f + 0.5f * sign(dir.y)) - p.y) * idir.y; - float tz = (floorf(p.z + 0.5f + 0.5f * sign(dir.z)) - p.z) * idir.z; - float t = min(min(tx, ty), tz); - - return fmaxf(t / res, 0.0f); -} - -inline __device__ float advance_to_next_voxel(float t, float cone_angle, const vec3& pos, const vec3& dir, const vec3& idir, uint32_t mip) { - float res = scalbnf(NERF_GRIDSIZE(), -(int)mip); - - float t_target = t + distance_to_next_voxel(pos, dir, idir, res); - - // Analytic stepping in multiples of 1 in the "log-space" of our exponential stepping routine - t = to_stepping_space(t, cone_angle); - t_target = to_stepping_space(t_target, cone_angle); - - return from_stepping_space(t + ceilf(fmaxf(t_target - t, 0.5f)), cone_angle); -} - -__device__ float network_to_rgb(float val, ENerfActivation activation) { - switch (activation) { - case ENerfActivation::None: return val; - case ENerfActivation::ReLU: return val > 0.0f ? val : 0.0f; - case ENerfActivation::Logistic: return tcnn::logistic(val); - case ENerfActivation::Exponential: return __expf(tcnn::clamp(val, -10.0f, 10.0f)); - default: assert(false); - } - return 0.0f; -} - -__device__ float network_to_rgb_derivative(float val, ENerfActivation activation) { - switch (activation) { - case ENerfActivation::None: return 1.0f; - case ENerfActivation::ReLU: return val > 0.0f ? 1.0f : 0.0f; - case ENerfActivation::Logistic: { float density = tcnn::logistic(val); return density * (1 - density); }; - case ENerfActivation::Exponential: return __expf(tcnn::clamp(val, -10.0f, 10.0f)); - default: assert(false); - } - return 0.0f; -} - -template -__device__ vec3 network_to_rgb_derivative_vec(const T& val, ENerfActivation activation) { - return { - network_to_rgb_derivative(float(val[0]), activation), - network_to_rgb_derivative(float(val[1]), activation), - network_to_rgb_derivative(float(val[2]), activation), - }; -} - -__device__ float network_to_density(float val, ENerfActivation activation) { - switch (activation) { - case ENerfActivation::None: return val; - case ENerfActivation::ReLU: return val > 0.0f ? 
val : 0.0f; - case ENerfActivation::Logistic: return tcnn::logistic(val); - case ENerfActivation::Exponential: return __expf(val); - default: assert(false); - } - return 0.0f; -} - -__device__ float network_to_density_derivative(float val, ENerfActivation activation) { - switch (activation) { - case ENerfActivation::None: return 1.0f; - case ENerfActivation::ReLU: return val > 0.0f ? 1.0f : 0.0f; - case ENerfActivation::Logistic: { float density = tcnn::logistic(val); return density * (1 - density); }; - case ENerfActivation::Exponential: return __expf(tcnn::clamp(val, -15.0f, 15.0f)); - default: assert(false); - } - return 0.0f; -} - -template -__device__ vec3 network_to_rgb_vec(const T& val, ENerfActivation activation) { - return { - network_to_rgb(float(val[0]), activation), - network_to_rgb(float(val[1]), activation), - network_to_rgb(float(val[2]), activation), - }; -} - -__device__ vec3 warp_position(const vec3& pos, const BoundingBox& aabb) { - // return {tcnn::logistic(pos.x - 0.5f), tcnn::logistic(pos.y - 0.5f), tcnn::logistic(pos.z - 0.5f)}; - // return pos; - - return aabb.relative_pos(pos); -} - -__device__ vec3 unwarp_position(const vec3& pos, const BoundingBox& aabb) { - // return {logit(pos.x) + 0.5f, logit(pos.y) + 0.5f, logit(pos.z) + 0.5f}; - // return pos; - - return aabb.min + pos * aabb.diag(); -} - -__device__ vec3 unwarp_position_derivative(const vec3& pos, const BoundingBox& aabb) { - // return {logit(pos.x) + 0.5f, logit(pos.y) + 0.5f, logit(pos.z) + 0.5f}; - // return pos; - - return aabb.diag(); -} - -__device__ vec3 warp_position_derivative(const vec3& pos, const BoundingBox& aabb) { - return vec3(1.0f) / unwarp_position_derivative(pos, aabb); -} - -__host__ __device__ vec3 warp_direction(const vec3& dir) { - return (dir + vec3(1.0f)) * 0.5f; -} - -__device__ vec3 unwarp_direction(const vec3& dir) { - return dir * 2.0f - vec3(1.0f); -} - -__device__ vec3 warp_direction_derivative(const vec3& dir) { - return vec3(0.5f); -} - -__device__ vec3 unwarp_direction_derivative(const vec3& dir) { - return vec3(2.0f); -} - -__device__ float warp_dt(float dt) { - float max_stepsize = MIN_CONE_STEPSIZE() * (1<<(NERF_CASCADES()-1)); - return (dt - MIN_CONE_STEPSIZE()) / (max_stepsize - MIN_CONE_STEPSIZE()); -} - -__device__ float unwarp_dt(float dt) { - float max_stepsize = MIN_CONE_STEPSIZE() * (1<<(NERF_CASCADES()-1)); - return dt * (max_stepsize - MIN_CONE_STEPSIZE()) + MIN_CONE_STEPSIZE(); -} - -__device__ uint32_t cascaded_grid_idx_at(vec3 pos, uint32_t mip) { - float mip_scale = scalbnf(1.0f, -mip); - pos -= vec3(0.5f); - pos *= mip_scale; - pos += vec3(0.5f); - - ivec3 i = pos * (float)NERF_GRIDSIZE(); - if (i.x < 0 || i.x >= NERF_GRIDSIZE() || i.y < 0 || i.y >= NERF_GRIDSIZE() || i.z < 0 || i.z >= NERF_GRIDSIZE()) { - return 0xFFFFFFFF; - } - - return tcnn::morton3D(i.x, i.y, i.z); -} - -__device__ bool density_grid_occupied_at(const vec3& pos, const uint8_t* density_grid_bitfield, uint32_t mip) { - uint32_t idx = cascaded_grid_idx_at(pos, mip); - if (idx == 0xFFFFFFFF) { - return false; - } - return density_grid_bitfield[idx/8+grid_mip_offset(mip)/8] & (1<<(idx%8)); -} - -__device__ float cascaded_grid_at(vec3 pos, const float* cascaded_grid, uint32_t mip) { - uint32_t idx = cascaded_grid_idx_at(pos, mip); - if (idx == 0xFFFFFFFF) { - return 0.0f; - } - return cascaded_grid[idx+grid_mip_offset(mip)]; -} - -__device__ float& cascaded_grid_at(vec3 pos, float* cascaded_grid, uint32_t mip) { - uint32_t idx = cascaded_grid_idx_at(pos, mip); - if (idx == 0xFFFFFFFF) { - 
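// For intuition about the sentinel handled just below: cascaded_grid_idx_at()
// contracts the query position about the grid center by 2^-mip before quantizing,
// so each cascade covers twice the extent of the previous one at the same cell
// count. A rough sketch of that mapping, assuming the default NERF_GRIDSIZE() of 128:
//
//   vec3 p = (pos - vec3(0.5f)) * scalbnf(1.0f, -(int)mip) + vec3(0.5f); // contract toward 0.5
//   ivec3 cell = p * 128.0f;                          // quantize to the 128^3 cascade
//   uint32_t idx = morton3D(cell.x, cell.y, cell.z);  // Morton order keeps neighbors close in memory
//
// Positions that fall outside the cascade quantize out of [0, 128)^3 and produce
// the 0xFFFFFFFF sentinel, in which case this accessor falls back to cell 0 and
// prints a warning rather than reading out of bounds.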
idx = 0; - printf("WARNING: invalid cascaded grid access."); - } - return cascaded_grid[idx+grid_mip_offset(mip)]; -} - __global__ void extract_srgb_with_activation(const uint32_t n_elements, const uint32_t rgb_stride, const float* __restrict__ rgbd, float* __restrict__ rgb, ENerfActivation rgb_activation, bool from_linear) { const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n_elements) return; @@ -432,12 +83,12 @@ __global__ void mark_untrained_density_grid(const uint32_t n_elements, float* _ uint32_t level = i / NERF_GRID_N_CELLS(); uint32_t pos_idx = i % NERF_GRID_N_CELLS(); - uint32_t x = tcnn::morton3D_invert(pos_idx>>0); - uint32_t y = tcnn::morton3D_invert(pos_idx>>1); - uint32_t z = tcnn::morton3D_invert(pos_idx>>2); + uint32_t x = morton3D_invert(pos_idx>>0); + uint32_t y = morton3D_invert(pos_idx>>1); + uint32_t z = morton3D_invert(pos_idx>>2); float voxel_size = scalbnf(1.0f / NERF_GRIDSIZE(), level); - vec3 pos = (vec3{(float)x, (float)y, (float)z} / (float)NERF_GRIDSIZE() - vec3(0.5f)) * scalbnf(1.0f, level) + vec3(0.5f); + vec3 pos = (vec3{(float)x, (float)y, (float)z} / (float)NERF_GRIDSIZE() - 0.5f) * scalbnf(1.0f, level) + 0.5f; vec3 corners[8] = { pos + vec3{0.0f, 0.0f, 0.0f }, @@ -503,7 +154,7 @@ __global__ void generate_grid_samples_nerf_uniform(ivec3 res_3d, const uint32_t } uint32_t i = x + y * res_3d.x + z * res_3d.x * res_3d.y; - vec3 pos = vec3{(float)x, (float)y, (float)z} / vec3(res_3d - ivec3(1)); + vec3 pos = vec3{(float)x, (float)y, (float)z} / vec3(res_3d - 1); pos = transpose(render_aabb_to_local) * (pos * (render_aabb.max - render_aabb.min) + render_aabb.min); out[i] = { warp_position(pos, train_aabb), warp_dt(MIN_CONE_STEPSIZE()) }; } @@ -523,7 +174,7 @@ __global__ void generate_grid_samples_nerf_uniform_dir(ivec3 res_3d, const uint3 if (voxel_centers) { pos = vec3{(float)x + 0.5f, (float)y + 0.5f, (float)z + 0.5f} / vec3(res_3d); } else { - pos = vec3{(float)x, (float)y, (float)z} / vec3(res_3d - ivec3(1)); + pos = vec3{(float)x, (float)y, (float)z} / vec3(res_3d - 1); } pos = transpose(render_aabb_to_local) * (pos * (render_aabb.max - render_aabb.min) + render_aabb.min); @@ -531,25 +182,6 @@ __global__ void generate_grid_samples_nerf_uniform_dir(ivec3 res_3d, const uint3 network_input(i)->set_with_optional_extra_dims(warp_position(pos, train_aabb), warp_direction(ray_dir), warp_dt(MIN_CONE_STEPSIZE()), extra_dims, network_input.stride_in_bytes); } -inline __device__ uint32_t mip_from_pos(const vec3& pos, uint32_t max_cascade = NERF_CASCADES()-1) { - int exponent; - float maxval = compMax(abs(pos - vec3(0.5f))); - frexpf(maxval, &exponent); - return (uint32_t)tcnn::clamp(exponent+1, 0, (int)max_cascade); -} - -inline __device__ uint32_t mip_from_dt(float dt, const vec3& pos, uint32_t max_cascade = NERF_CASCADES()-1) { - uint32_t mip = mip_from_pos(pos, max_cascade); - dt *= 2 * NERF_GRIDSIZE(); - if (dt < 1.0f) { - return mip; - } - - int exponent; - frexpf(dt, &exponent); - return (uint32_t)tcnn::clamp((int)mip, exponent, (int)max_cascade); -} - __global__ void generate_grid_samples_nerf_nonuniform(const uint32_t n_elements, default_rng_t rng, const uint32_t step, BoundingBox aabb, const float* __restrict__ grid_in, NerfPosition* __restrict__ out, uint32_t* __restrict__ indices, uint32_t n_cascades, float thresh) { const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n_elements) return; @@ -571,17 +203,17 @@ __global__ void generate_grid_samples_nerf_nonuniform(const uint32_t n_elements, // Random position within that cellq 
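// As a worked example of the decomposition below (assuming the usual
// NERF_GRID_N_CELLS() == NERF_GRIDSIZE()^3 == 128^3 cells per cascade): an index
// of 128^3 + 5 selects Morton cell 5 of cascade level 1. The jittered point inside
// that cell is first expressed in the cascade's [0,1]^3 and then expanded by
// scalbnf(1.0f, level) about the grid center -- the exact inverse of the
// contraction used when the grid is read back.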
uint32_t pos_idx = idx % NERF_GRID_N_CELLS(); - uint32_t x = tcnn::morton3D_invert(pos_idx>>0); - uint32_t y = tcnn::morton3D_invert(pos_idx>>1); - uint32_t z = tcnn::morton3D_invert(pos_idx>>2); + uint32_t x = morton3D_invert(pos_idx>>0); + uint32_t y = morton3D_invert(pos_idx>>1); + uint32_t z = morton3D_invert(pos_idx>>2); - vec3 pos = ((vec3{(float)x, (float)y, (float)z} + random_val_3d(rng)) / (float)NERF_GRIDSIZE() - vec3(0.5f)) * scalbnf(1.0f, level) + vec3(0.5f); + vec3 pos = ((vec3{(float)x, (float)y, (float)z} + random_val_3d(rng)) / (float)NERF_GRIDSIZE() - 0.5f) * scalbnf(1.0f, level) + 0.5f; out[i] = { warp_position(pos, aabb), warp_dt(MIN_CONE_STEPSIZE()) }; indices[i] = idx; } -__global__ void splat_grid_samples_nerf_max_nearest_neighbor(const uint32_t n_elements, const uint32_t* __restrict__ indices, const tcnn::network_precision_t* network_output, float* __restrict__ grid_out, ENerfActivation rgb_activation, ENerfActivation density_activation) { +__global__ void splat_grid_samples_nerf_max_nearest_neighbor(const uint32_t n_elements, const uint32_t* __restrict__ indices, const network_precision_t* network_output, float* __restrict__ grid_out, ENerfActivation rgb_activation, ENerfActivation density_activation) { const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n_elements) return; @@ -599,7 +231,7 @@ __global__ void splat_grid_samples_nerf_max_nearest_neighbor(const uint32_t n_el atomicMax((uint32_t*)&grid_out[local_idx], __float_as_uint(optical_thickness)); } -__global__ void grid_samples_half_to_float(const uint32_t n_elements, BoundingBox aabb, float* dst, const tcnn::network_precision_t* network_output, ENerfActivation density_activation, const NerfPosition* __restrict__ coords_in, const float* __restrict__ grid_in, uint32_t max_cascade) { +__global__ void grid_samples_half_to_float(const uint32_t n_elements, BoundingBox aabb, float* dst, const network_precision_t* network_output, ENerfActivation density_activation, const NerfPosition* __restrict__ coords_in, const float* __restrict__ grid_in, uint32_t max_cascade) { const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n_elements) return; @@ -691,46 +323,11 @@ __global__ void bitfield_max_pool(const uint32_t n_elements, bits |= prev_level[i*8+j] > 0 ? ((uint8_t)1 << j) : 0; } - uint32_t x = tcnn::morton3D_invert(i>>0) + NERF_GRIDSIZE()/8; - uint32_t y = tcnn::morton3D_invert(i>>1) + NERF_GRIDSIZE()/8; - uint32_t z = tcnn::morton3D_invert(i>>2) + NERF_GRIDSIZE()/8; - - next_level[tcnn::morton3D(x, y, z)] |= bits; -} - -template -__device__ float if_unoccupied_advance_to_next_occupied_voxel( - float t, - float cone_angle, - const Ray& ray, - const vec3& idir, - const uint8_t* __restrict__ density_grid, - uint32_t min_mip, - uint32_t max_mip, - BoundingBox aabb, - mat3 aabb_to_local = mat3(1.0f) -) { - while (true) { - vec3 pos = ray(t); - if (t >= MAX_DEPTH() || !aabb.contains(aabb_to_local * pos)) { - return MAX_DEPTH(); - } - - uint32_t mip = tcnn::clamp(MIP_FROM_DT ? mip_from_dt(calc_dt(t, cone_angle), pos) : mip_from_pos(pos), min_mip, max_mip); - - if (!density_grid || density_grid_occupied_at(pos, density_grid, mip)) { - return t; - } + uint32_t x = morton3D_invert(i>>0) + NERF_GRIDSIZE()/8; + uint32_t y = morton3D_invert(i>>1) + NERF_GRIDSIZE()/8; + uint32_t z = morton3D_invert(i>>2) + NERF_GRIDSIZE()/8; - // Find largest empty voxel surrounding us, such that we can advance as far as possible in the next step. 
- // Other places that do voxel stepping don't need this, because they don't rely on thread coherence as - // much as this one here. - while (mip < max_mip && !density_grid_occupied_at(pos, density_grid, mip+1)) { - ++mip; - } - - t = advance_to_next_voxel(t, cone_angle, pos, ray.d, idir, mip); - } + next_level[morton3D(x, y, z)] |= bits; } __device__ void advance_pos_nerf( @@ -787,7 +384,7 @@ __global__ void generate_nerf_network_inputs_from_positions(const uint32_t n_ele const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; if (i >= n_elements) return; - vec3 dir = normalize(pos[i] - vec3(0.5f)); // choose outward pointing directions, for want of a better choice + vec3 dir = normalize(pos[i] - 0.5f); // choose outward pointing directions, for want of a better choice network_input(i)->set_with_optional_extra_dims(warp_position(pos[i], aabb), warp_direction(dir), warp_dt(MIN_CONE_STEPSIZE()), extra_dims, network_input.stride_in_bytes); } @@ -807,10 +404,10 @@ __device__ vec4 compute_nerf_rgba(const vec4& network_output, ENerfActivation rg if (density_as_alpha) { rgba.a = density; } else { - rgba.a = alpha = tcnn::clamp(1.f - __expf(-density * depth), 0.0f, 1.0f); + rgba.a = alpha = clamp(1.f - __expf(-density * depth), 0.0f, 1.0f); } - rgba.rgb = network_to_rgb_vec(rgba.rgb, rgb_activation) * alpha; + rgba.rgb() = network_to_rgb_vec(rgba.rgb(), rgb_activation) * alpha; return rgba; } @@ -884,7 +481,7 @@ __global__ void composite_kernel_nerf( float* __restrict__ depth, NerfPayload* payloads, PitchedPtr network_input, - const tcnn::network_precision_t* __restrict__ network_output, + const network_precision_t* __restrict__ network_output, uint32_t padded_output_width, uint32_t n_steps, ERenderMode render_mode, @@ -912,7 +509,7 @@ __global__ void composite_kernel_nerf( uint32_t j = 0; for (; j < actual_n_steps; ++j) { - tcnn::vector_t local_network_output; + tvec local_network_output; local_network_output[0] = network_output[i + j * n_elements + 0 * stride]; local_network_output[1] = network_output[i + j * n_elements + 1 * stride]; local_network_output[2] = network_output[i + j * n_elements + 2 * stride]; @@ -1036,19 +633,7 @@ __global__ void composite_kernel_nerf( vec3 normal = -network_to_density_derivative(float(local_network_output[3]), density_activation) * warped_pos; rgb = normalize(normal); } else if (render_mode == ERenderMode::Positions) { - if (show_accel >= 0) { - uint32_t mip = max(show_accel, mip_from_pos(pos)); - uint32_t res = NERF_GRIDSIZE() >> mip; - int ix = pos.x * res; - int iy = pos.y * res; - int iz = pos.z * res; - default_rng_t rng(ix + iy * 232323 + iz * 727272); - rgb.x = 1.f - mip * (1.f / (NERF_CASCADES() - 1)); - rgb.y = rng.next_float(); - rgb.z = rng.next_float(); - } else { - rgb = (pos - vec3(0.5f)) / 2.0f + vec3(0.5f); - } + rgb = (pos - 0.5f) / 2.0f + 0.5f; } else if (render_mode == ERenderMode::EncodingVis) { rgb = warped_pos; } else if (render_mode == ERenderMode::Depth) { @@ -1057,6 +642,18 @@ __global__ void composite_kernel_nerf( rgb = vec3(alpha); } + if (show_accel >= 0) { + uint32_t mip = max((uint32_t)show_accel, mip_from_pos(pos)); + uint32_t res = NERF_GRIDSIZE() >> mip; + int ix = pos.x * res; + int iy = pos.y * res; + int iz = pos.z * res; + default_rng_t rng(ix + iy * 232323 + iz * 727272); + rgb.x = 1.f - mip * (1.f / (NERF_CASCADES() - 1)); + rgb.y = rng.next_float(); + rgb.z = rng.next_float(); + } + local_rgba += vec4(rgb * weight, weight); if (weight > payload.max_weight) { payload.max_weight = weight; @@ -1078,119 +675,6 @@ __global__ 
void composite_kernel_nerf( depth[i] = local_depth; } -static constexpr float UNIFORM_SAMPLING_FRACTION = 0.5f; - -inline __device__ vec2 sample_cdf_2d(vec2 sample, uint32_t img, const ivec2& res, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y, float* __restrict__ pdf) { - if (sample.x < UNIFORM_SAMPLING_FRACTION) { - sample.x /= UNIFORM_SAMPLING_FRACTION; - return sample; - } - - sample.x = (sample.x - UNIFORM_SAMPLING_FRACTION) / (1.0f - UNIFORM_SAMPLING_FRACTION); - - cdf_y += img * res.y; - - // First select row according to cdf_y - uint32_t y = binary_search(sample.y, cdf_y, res.y); - float prev = y > 0 ? cdf_y[y-1] : 0.0f; - float pmf_y = cdf_y[y] - prev; - sample.y = (sample.y - prev) / pmf_y; - - cdf_x_cond_y += img * res.y * res.x + y * res.x; - - // Then, select col according to x - uint32_t x = binary_search(sample.x, cdf_x_cond_y, res.x); - prev = x > 0 ? cdf_x_cond_y[x-1] : 0.0f; - float pmf_x = cdf_x_cond_y[x] - prev; - sample.x = (sample.x - prev) / pmf_x; - - if (pdf) { - *pdf = pmf_x * pmf_y * compMul(res); - } - - return {((float)x + sample.x) / (float)res.x, ((float)y + sample.y) / (float)res.y}; -} - -inline __device__ float pdf_2d(vec2 sample, uint32_t img, const ivec2& res, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y) { - ivec2 p = clamp(ivec2(sample * vec2(res)), ivec2(0), res - ivec2(1)); - - cdf_y += img * res.y; - cdf_x_cond_y += img * res.y * res.x + p.y * res.x; - - float pmf_y = cdf_y[p.y]; - if (p.y > 0) { - pmf_y -= cdf_y[p.y-1]; - } - - float pmf_x = cdf_x_cond_y[p.x]; - if (p.x > 0) { - pmf_x -= cdf_x_cond_y[p.x-1]; - } - - // Probability mass of picking the pixel - float pmf = pmf_x * pmf_y; - - // To convert to probability density, divide by area of pixel - return UNIFORM_SAMPLING_FRACTION + pmf * compMul(res) * (1.0f - UNIFORM_SAMPLING_FRACTION); -} - -inline __device__ vec2 nerf_random_image_pos_training(default_rng_t& rng, const ivec2& resolution, bool snap_to_pixel_centers, const float* __restrict__ cdf_x_cond_y, const float* __restrict__ cdf_y, const ivec2& cdf_res, uint32_t img, float* __restrict__ pdf = nullptr) { - vec2 uv = random_val_2d(rng); - - if (cdf_x_cond_y) { - uv = sample_cdf_2d(uv, img, cdf_res, cdf_x_cond_y, cdf_y, pdf); - } else if (pdf) { - *pdf = 1.0f; - } - - if (snap_to_pixel_centers) { - uv = (vec2(clamp(ivec2(uv * vec2(resolution)), ivec2(0), resolution - ivec2(1))) + vec2(0.5f)) / vec2(resolution); - } - - return uv; -} - -inline __device__ uint32_t image_idx(uint32_t base_idx, uint32_t n_rays, uint32_t n_rays_total, uint32_t n_training_images, const float* __restrict__ cdf = nullptr, float* __restrict__ pdf = nullptr) { - if (cdf) { - float sample = ld_random_val(base_idx/* + n_rays_total*/, 0xdeadbeef); - // float sample = random_val(base_idx/* + n_rays_total*/); - uint32_t img = binary_search(sample, cdf, n_training_images); - - if (pdf) { - float prev = img > 0 ? cdf[img-1] : 0.0f; - *pdf = (cdf[img] - prev) * n_training_images; - } - - return img; - } - - // return ((base_idx/* + n_rays_total*/) * 56924617 + 96925573) % n_training_images; - - // Neighboring threads in the warp process the same image. Increases locality. 
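// A quick worked example of the fallback mapping below: with n_rays == 4096 and
// n_training_images == 64, (base_idx * 64) / 4096 increments once every 64
// consecutive ray indices, so rays 0..63 sample image 0, rays 64..127 sample
// image 1, and so on. Adjacent threads in a warp therefore touch the same image's
// pixels, which is the locality noted above. (The numbers are illustrative; only
// the ratio n_rays / n_training_images matters.)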
- if (pdf) { - *pdf = 1.0f; - } - return (((base_idx/* + n_rays_total*/) * n_training_images) / n_rays) % n_training_images; -} - -__device__ LossAndGradient loss_and_gradient(const vec3& target, const vec3& prediction, ELossType loss_type) { - switch (loss_type) { - case ELossType::RelativeL2: return relative_l2_loss(target, prediction); break; - case ELossType::L1: return l1_loss(target, prediction); break; - case ELossType::Mape: return mape_loss(target, prediction); break; - case ELossType::Smape: return smape_loss(target, prediction); break; - // Note: we divide the huber loss by a factor of 5 such that its L2 region near zero - // matches with the L2 loss and error numbers become more comparable. This allows reading - // off dB numbers of ~converged models and treating them as approximate PSNR to compare - // with other NeRF methods. Self-normalizing optimizers such as Adam are agnostic to such - // constant factors; optimization is therefore unaffected. - case ELossType::Huber: return huber_loss(target, prediction, 0.1f) / 5.0f; break; - case ELossType::LogL1: return log_l1_loss(target, prediction); break; - default: case ELossType::L2: return l2_loss(target, prediction); break; - } -} - - __global__ void generate_training_samples_nerf( const uint32_t n_rays, BoundingBox aabb, @@ -1366,14 +850,14 @@ __global__ void compute_loss_kernel_train_nerf( bool train_in_linear_colors, const uint32_t n_training_images, const TrainingImageMetadata* __restrict__ metadata, - const tcnn::network_precision_t* network_output, + const network_precision_t* network_output, uint32_t* __restrict__ numsteps_counter, const uint32_t* __restrict__ ray_indices_in, const Ray* __restrict__ rays_in_unnormalized, uint32_t* __restrict__ numsteps_in, PitchedPtr coords_in, PitchedPtr coords_out, - tcnn::network_precision_t* dloss_doutput, + network_precision_t* dloss_doutput, ELossType loss_type, ELossType depth_loss_type, float* __restrict__ loss_output, @@ -1424,7 +908,7 @@ __global__ void compute_loss_kernel_train_nerf( break; } - const tcnn::vector_t local_network_output = *(tcnn::vector_t*)network_output; + const tvec local_network_output = *(tvec*)network_output; const vec3 rgb = network_to_rgb_vec(local_network_output, rgb_activation); const vec3 pos = unwarp_position(coords_in.ptr->pos.p, aabb); const float dt = unwarp_dt(coords_in.ptr->dt); @@ -1469,7 +953,7 @@ __global__ void compute_loss_kernel_train_nerf( if (envmap) { dir = normalize(rays_in_unnormalized[i].d); envmap_value = read_envmap(envmap, dir); - background_color = envmap_value.rgb + background_color * (1.0f - envmap_value.a); + background_color = envmap_value.rgb() + background_color * (1.0f - envmap_value.a); } vec3 exposure_scale = exp(0.6931471805599453f * exposure[img]); @@ -1479,7 +963,7 @@ __global__ void compute_loss_kernel_train_nerf( vec3 rgbtarget; if (train_in_linear_colors || color_space == EColorSpace::Linear) { - rgbtarget = exposure_scale * texsamp.rgb + (1.0f - texsamp.a) * background_color; + rgbtarget = exposure_scale * texsamp.rgb() + (1.0f - texsamp.a) * background_color; if (!train_in_linear_colors) { rgbtarget = linear_to_srgb(rgbtarget); @@ -1488,7 +972,7 @@ __global__ void compute_loss_kernel_train_nerf( } else if (color_space == EColorSpace::SRGB) { background_color = linear_to_srgb(background_color); if (texsamp.a > 0) { - rgbtarget = linear_to_srgb(exposure_scale * texsamp.rgb / texsamp.a) * texsamp.a + (1.0f - texsamp.a) * background_color; + rgbtarget = linear_to_srgb(exposure_scale * texsamp.rgb() / texsamp.a) * 
texsamp.a + (1.0f - texsamp.a) * background_color; } else { rgbtarget = background_color; } @@ -1529,25 +1013,25 @@ __global__ void compute_loss_kernel_train_nerf( // to change the weighting of the loss function. So don't divide. // lg.gradient /= img_pdf * uv_pdf; - float mean_loss = compAdd(lg.loss) / 3.0f; + float mean_loss = mean(lg.loss); if (loss_output) { loss_output[i] = mean_loss / (float)n_rays; } if (error_map) { - const vec2 pos = clamp(uv * vec2(error_map_res) - vec2(0.5f), vec2(0.0f), vec2(error_map_res) - vec2(1.0f + 1e-4f)); + const vec2 pos = clamp(uv * vec2(error_map_res) - 0.5f, 0.0f, vec2(error_map_res) - (1.0f + 1e-4f)); const ivec2 pos_int = pos; const vec2 weight = pos - vec2(pos_int); - ivec2 idx = clamp(pos_int, ivec2(0), resolution - ivec2(2)); + ivec2 idx = clamp(pos_int, 0, resolution - 2); auto deposit_val = [&](int x, int y, float val) { - atomicAdd(&error_map[img * compMul(error_map_res) + y * error_map_res.x + x], val); + atomicAdd(&error_map[img * product(error_map_res) + y * error_map_res.x + x], val); }; if (sharpness_data && aabb.contains(hitpoint)) { - ivec2 sharpness_pos = clamp(ivec2(uv * vec2(sharpness_resolution)), ivec2(0), sharpness_resolution - ivec2(1)); - float sharp = sharpness_data[img * compMul(sharpness_resolution) + sharpness_pos.y * sharpness_resolution.x + sharpness_pos.x] + 1e-6f; + ivec2 sharpness_pos = clamp(ivec2(uv * vec2(sharpness_resolution)), 0, sharpness_resolution - 1); + float sharp = sharpness_data[img * product(sharpness_resolution) + sharpness_pos.y * sharpness_resolution.x + sharpness_pos.x] + 1e-6f; // The maximum value of positive floats interpreted in uint format is the same as the maximum value of the floats. float grid_sharp = __uint_as_float(atomicMax((uint32_t*)&cascaded_grid_at(hitpoint, sharpness_grid, mip_from_pos(hitpoint, max_mip)), __float_as_uint(sharp))); @@ -1584,7 +1068,7 @@ __global__ void compute_loss_kernel_train_nerf( float depth = distance(pos, ray_o); float dt = unwarp_dt(coord_in->dt); - const tcnn::vector_t local_network_output = *(tcnn::vector_t*)network_output; + const tvec local_network_output = *(tvec*)network_output; const vec3 rgb = network_to_rgb_vec(local_network_output, rgb_activation); const float density = network_to_density(float(local_network_output[3]), density_activation); const float alpha = 1.f - __expf(-density * dt); @@ -1597,7 +1081,7 @@ __global__ void compute_loss_kernel_train_nerf( const vec3 suffix = rgb_ray - rgb_ray2; const vec3 dloss_by_drgb = weight * lg.gradient; - tcnn::vector_t local_dL_doutput; + tvec local_dL_doutput; // chain rule to go from dloss/drgb to dloss/dmlp_output local_dL_doutput[0] = loss_scale * (dloss_by_drgb.x * network_to_rgb_derivative(local_network_output[0], rgb_activation) + fmaxf(0.0f, output_l2_reg * (float)local_network_output[0])); // Penalize way too large color values @@ -1621,7 +1105,7 @@ __global__ void compute_loss_kernel_train_nerf( (float(local_network_output[3]) > -10.0f && depth < near_distance ? 
1e-4f : 0.0f); ; - *(tcnn::vector_t*)dloss_doutput = local_dL_doutput; + *(tvec*)dloss_doutput = local_dL_doutput; dloss_doutput += padded_output_width; network_output += padded_output_width; @@ -1653,7 +1137,7 @@ __global__ void compute_loss_kernel_train_nerf( dloss_by_dbackground /= srgb_to_linear_derivative(background_color); } - tcnn::vector_t dL_denvmap; + tvec dL_denvmap; dL_denvmap[0] = loss_scale * dloss_by_dbackground.x; dL_denvmap[1] = loss_scale * dloss_by_dbackground.y; dL_denvmap[2] = loss_scale * dloss_by_dbackground.z; @@ -1662,7 +1146,7 @@ __global__ void compute_loss_kernel_train_nerf( float dloss_by_denvmap_alpha = -dot(dloss_by_dbackground, pre_envmap_background_color); // dL_denvmap[3] = loss_scale * dloss_by_denvmap_alpha; - dL_denvmap[3] = (tcnn::network_precision_t)0; + dL_denvmap[3] = (network_precision_t)0; deposit_envmap_gradient(dL_denvmap, envmap_gradient, envmap_resolution, dir); } @@ -1817,6 +1301,9 @@ __global__ void compute_extra_dims_gradient_train_nerf( __global__ void shade_kernel_nerf( const uint32_t n_elements, + bool gbuffer_hard_edges, + mat4x3 camera_matrix, + float depth_scale, vec4* __restrict__ rgba, float* __restrict__ depth, NerfPayload* __restrict__ payloads, @@ -1826,21 +1313,26 @@ __global__ void shade_kernel_nerf( float* __restrict__ depth_buffer ) { const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x; - if (i >= n_elements) return; + if (i >= n_elements || render_mode == ERenderMode::Distortion) return; NerfPayload& payload = payloads[i]; vec4 tmp = rgba[i]; if (render_mode == ERenderMode::Normals) { vec3 n = normalize(tmp.xyz()); - tmp.rgb = (0.5f * n + vec3(0.5f)) * tmp.a; + tmp.rgb() = (0.5f * n + 0.5f) * tmp.a; } else if (render_mode == ERenderMode::Cost) { float col = (float)payload.n_steps / 128; tmp = {col, col, col, 1.0f}; + } else if (gbuffer_hard_edges && render_mode == ERenderMode::Depth) { + tmp.rgb() = vec3(depth[i] * depth_scale); + } else if (gbuffer_hard_edges && render_mode == ERenderMode::Positions) { + vec3 pos = camera_matrix[3] + payload.dir / dot(payload.dir, camera_matrix[2]) * depth[i]; + tmp.rgb() = (pos - 0.5f) / 2.0f + 0.5f; } if (!train_in_linear_colors && (render_mode == ERenderMode::Shade || render_mode == ERenderMode::Slice)) { // Accumulate in linear colors - tmp.rgb = srgb_to_linear(tmp.rgb); + tmp.rgb() = srgb_to_linear(tmp.rgb()); } frame_buffer[payload.idx] = tmp + frame_buffer[payload.idx] * (1.0f - tmp.a); @@ -1914,13 +1406,14 @@ __global__ void init_rays_with_payload_kernel_nerf( vec2 pixel_offset = ld_random_pixel_offset(snap_to_pixel_centers ? 
0 : sample_index); vec2 uv = vec2{(float)x + pixel_offset.x, (float)y + pixel_offset.y} / vec2(resolution); - float ray_time = rolling_shutter.x + rolling_shutter.y * uv.x + rolling_shutter.z * uv.y + rolling_shutter.w * ld_random_val(sample_index, idx * 72239731); + mat4x3 camera = get_xform_given_rolling_shutter({camera_matrix0, camera_matrix1}, rolling_shutter, uv, ld_random_val(sample_index, idx * 72239731)); + Ray ray = uv_to_ray( sample_index, uv, resolution, focal_length, - camera_matrix0 * ray_time + camera_matrix1 * (1.f - ray_time), + camera, screen_center, parallax_shift, near_distance, @@ -1955,6 +1448,17 @@ __global__ void init_rays_with_payload_kernel_nerf( return; } + if (render_mode == ERenderMode::Distortion) { + vec2 uv_after_distortion = pos_to_uv(ray(1.0f), resolution, focal_length, camera, screen_center, parallax_shift, foveation); + + frame_buffer[idx].rgb() = to_rgb((uv_after_distortion - uv) * 64.0f); + frame_buffer[idx].a = 1.0f; + depth_buffer[idx] = 1.0f; + payload.origin = ray(MAX_DEPTH()); + payload.alive = false; + return; + } + ray.d = normalize(ray.d); if (envmap) { @@ -1969,20 +1473,6 @@ __global__ void init_rays_with_payload_kernel_nerf( return; } - if (render_mode == ERenderMode::Distortion) { - vec2 offset = vec2(0.0f); - if (distortion) { - offset += distortion.at_lerp(vec2{(float)x + 0.5f, (float)y + 0.5f} / vec2(resolution)); - } - - frame_buffer[idx].rgb() = to_rgb(offset * 50.0f); - frame_buffer[idx].a = 1.0f; - depth_buffer[idx] = 1.0f; - payload.origin = ray(MAX_DEPTH()); - payload.alive = false; - return; - } - payload.origin = ray.o; payload.dir = ray.d; payload.t = t; @@ -2140,7 +1630,7 @@ void Testbed::NerfTracer::init_rays_from_camera( } uint32_t Testbed::NerfTracer::trace( - NerfNetwork& network, + const std::shared_ptr>& network, const BoundingBox& render_aabb, const mat3& render_aabb_to_local, const BoundingBox& train_aabb, @@ -2198,9 +1688,9 @@ uint32_t Testbed::NerfTracer::trace( // Want a large number of queries to saturate the GPU and to ensure compaction doesn't happen toooo frequently. 
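// Rough numbers for the clamp below: with the 2M-query target and 500k rays still
// alive, each ray marches about 4 steps per inference batch; as rays terminate,
// the quotient grows until it saturates at MAX_STEPS_INBETWEEN_COMPACTION, so
// neither compaction nor kernel-relaunch overhead dominates at either extreme.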
uint32_t target_n_queries = 2 * 1024 * 1024; - uint32_t n_steps_between_compaction = tcnn::clamp(target_n_queries / n_alive, (uint32_t)MIN_STEPS_INBETWEEN_COMPACTION, (uint32_t)MAX_STEPS_INBETWEEN_COMPACTION); + uint32_t n_steps_between_compaction = clamp(target_n_queries / n_alive, (uint32_t)MIN_STEPS_INBETWEEN_COMPACTION, (uint32_t)MAX_STEPS_INBETWEEN_COMPACTION); - uint32_t extra_stride = network.n_extra_dims() * sizeof(float); + uint32_t extra_stride = network->n_extra_dims() * sizeof(float); PitchedPtr input_data((NerfCoordinate*)m_network_input, 1, 0, extra_stride); linear_kernel(generate_next_nerf_network_inputs, 0, stream, n_alive, @@ -2218,15 +1708,15 @@ uint32_t Testbed::NerfTracer::trace( cone_angle_constant, extra_dims_gpu ); - uint32_t n_elements = next_multiple(n_alive * n_steps_between_compaction, tcnn::batch_size_granularity); + uint32_t n_elements = next_multiple(n_alive * n_steps_between_compaction, BATCH_SIZE_GRANULARITY); GPUMatrix positions_matrix((float*)m_network_input, (sizeof(NerfCoordinate) + extra_stride) / sizeof(float), n_elements); - GPUMatrix rgbsigma_matrix((network_precision_t*)m_network_output, network.padded_output_width(), n_elements); - network.inference_mixed_precision(stream, positions_matrix, rgbsigma_matrix); + GPUMatrix rgbsigma_matrix((network_precision_t*)m_network_output, network->padded_output_width(), n_elements); + network->inference_mixed_precision(stream, positions_matrix, rgbsigma_matrix); if (render_mode == ERenderMode::Normals) { - network.input_gradient(stream, 3, positions_matrix, positions_matrix); + network->input_gradient(stream, 3, positions_matrix, positions_matrix); } else if (render_mode == ERenderMode::EncodingVis) { - network.visualize_activation(stream, visualized_layer, visualized_dim, positions_matrix, positions_matrix); + network->visualize_activation(stream, visualized_layer, visualized_dim, positions_matrix, positions_matrix); } linear_kernel(composite_kernel_nerf, 0, stream, @@ -2244,7 +1734,7 @@ uint32_t Testbed::NerfTracer::trace( rays_current.payload, input_data, m_network_output, - network.padded_output_width(), + network->padded_output_width(), n_steps_between_compaction, render_mode, grid, @@ -2264,7 +1754,7 @@ uint32_t Testbed::NerfTracer::trace( } void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_width, uint32_t n_extra_dims, cudaStream_t stream) { - n_elements = next_multiple(n_elements, size_t(tcnn::batch_size_granularity)); + n_elements = next_multiple(n_elements, size_t(BATCH_SIZE_GRANULARITY)); size_t num_floats = sizeof(NerfCoordinate) / sizeof(float) + n_extra_dims; auto scratch = allocate_workspace_and_distribute< vec4, float, NerfPayload, // m_rays[0] @@ -2297,26 +1787,21 @@ void Testbed::NerfTracer::enlarge(size_t n_elements, uint32_t padded_output_widt m_alive_counter = std::get<12>(scratch); } -void Testbed::Nerf::Training::reset_extra_dims(default_rng_t& rng) { - uint32_t n_extra_dims = dataset.n_extra_dims(); - std::vector extra_dims_cpu(n_extra_dims * (dataset.n_images + 1)); // n_images + 1 since we use an extra 'slot' for the inference latent code - float* dst = extra_dims_cpu.data(); - extra_dims_opt = std::vector(dataset.n_images, VarAdamOptimizer(n_extra_dims, 1e-4f)); - for (uint32_t i = 0; i < dataset.n_images; ++i) { - vec3 light_dir = warp_direction(normalize(dataset.metadata[i].light_dir)); - extra_dims_opt[i].reset_state(); - std::vector& optimzer_value = extra_dims_opt[i].variable(); - for (uint32_t j = 0; j < n_extra_dims; ++j) { - if (dataset.has_light_dirs 
&& j < 3) { - dst[j] = light_dir[j]; - } else { - dst[j] = random_val(rng) * 2.0f - 1.0f; - } - optimzer_value[j] = dst[j]; - } - dst += n_extra_dims; +std::vector Testbed::Nerf::Training::get_extra_dims_cpu(int trainview) const { + if (dataset.n_extra_dims() == 0) { + return {}; + } + + if (trainview < 0 || trainview >= dataset.n_images) { + throw std::runtime_error{"Invalid training view."}; } - extra_dims_gpu.resize_and_copy_from_host(extra_dims_cpu); + + const float* extra_dims_src = extra_dims_gpu.data() + trainview * dataset.n_extra_dims(); + + std::vector extra_dims_cpu(dataset.n_extra_dims()); + CUDA_CHECK_THROW(cudaMemcpy(extra_dims_cpu.data(), extra_dims_src, dataset.n_extra_dims() * sizeof(float), cudaMemcpyDeviceToHost)); + + return extra_dims_cpu; } void Testbed::Nerf::Training::update_extra_dims() { @@ -2332,29 +1817,11 @@ void Testbed::Nerf::Training::update_extra_dims() { CUDA_CHECK_THROW(cudaMemcpyAsync(extra_dims_gpu.data(), extra_dims_cpu.data(), extra_dims_opt.size() * n_extra_dims * sizeof(float), cudaMemcpyHostToDevice)); } -const float* Testbed::get_inference_extra_dims(cudaStream_t stream) const { - if (m_nerf_network->n_extra_dims() == 0) { - return nullptr; - } - const float* extra_dims_src = m_nerf.training.extra_dims_gpu.data() + m_nerf.extra_dim_idx_for_inference * m_nerf.training.dataset.n_extra_dims(); - if (!m_nerf.training.dataset.has_light_dirs) { - return extra_dims_src; - } - - // the dataset has light directions, so we must construct a temporary buffer and fill it as requested. - // we use an extra 'slot' that was pre-allocated for us at the end of the extra_dims array. - size_t size = m_nerf_network->n_extra_dims() * sizeof(float); - float* dims_gpu = m_nerf.training.extra_dims_gpu.data() + m_nerf.training.dataset.n_images * m_nerf.training.dataset.n_extra_dims(); - CUDA_CHECK_THROW(cudaMemcpyAsync(dims_gpu, extra_dims_src, size, cudaMemcpyDeviceToDevice, stream)); - vec3 light_dir = warp_direction(normalize(m_nerf.light_dir)); - CUDA_CHECK_THROW(cudaMemcpyAsync(dims_gpu, &light_dir, min(size, sizeof(vec3)), cudaMemcpyHostToDevice, stream)); - return dims_gpu; -} - void Testbed::render_nerf( cudaStream_t stream, + CudaDevice& device, const CudaRenderBufferView& render_buffer, - NerfNetwork& nerf_network, + const std::shared_ptr>& nerf_network, const uint8_t* density_grid_bitfield, const vec2& focal_length, const mat4x3& camera_matrix0, @@ -2371,20 +1838,25 @@ void Testbed::render_nerf( ERenderMode render_mode = visualized_dimension > -1 ? ERenderMode::EncodingVis : m_render_mode; - const float* extra_dims_gpu = get_inference_extra_dims(stream); + const float* extra_dims_gpu = m_nerf.get_rendering_extra_dims(stream); NerfTracer tracer; - // Our motion vector code can't undo grid distortions -- so don't render grid distortion if DLSS is enabled - auto grid_distortion = m_nerf.render_with_lens_distortion && !m_dlss ? m_distortion.inference_view() : Buffer2DView{}; + // Our motion vector code can't undo grid distortions -- so don't render grid distortion if DLSS is enabled. + // (Unless we're in distortion visualization mode, in which case the distortion grid is fine to visualize.) + auto grid_distortion = + m_nerf.render_with_lens_distortion && (!m_dlss || m_render_mode == ERenderMode::Distortion) ? + m_distortion.inference_view() : + Buffer2DView{}; + Lens lens = m_nerf.render_with_lens_distortion ? 
m_nerf.render_lens : Lens{}; auto resolution = render_buffer.resolution; tracer.init_rays_from_camera( render_buffer.spp, - nerf_network.padded_output_width(), - nerf_network.n_extra_dims(), + nerf_network->padded_output_width(), + nerf_network->n_extra_dims(), render_buffer.resolution, focal_length, camera_matrix0, @@ -2413,11 +1885,13 @@ void Testbed::render_nerf( stream ); + float depth_scale = 1.0f / m_nerf.training.dataset.scale; + bool render_2d = m_render_mode == ERenderMode::Slice || m_render_mode == ERenderMode::Distortion; + uint32_t n_hit; - if (m_render_mode == ERenderMode::Slice) { + if (render_2d) { n_hit = tracer.n_rays_initialized(); } else { - float depth_scale = 1.0f / m_nerf.training.dataset.scale; n_hit = tracer.trace( nerf_network, m_render_aabb, @@ -2442,28 +1916,31 @@ void Testbed::render_nerf( stream ); } - RaysNerfSoa& rays_hit = m_render_mode == ERenderMode::Slice ? tracer.rays_init() : tracer.rays_hit(); + RaysNerfSoa& rays_hit = render_2d ? tracer.rays_init() : tracer.rays_hit(); - if (m_render_mode == ERenderMode::Slice) { + if (render_2d) { // Store colors in the normal buffer - uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity); - const uint32_t floats_per_coord = sizeof(NerfCoordinate) / sizeof(float) + nerf_network.n_extra_dims(); - const uint32_t extra_stride = nerf_network.n_extra_dims() * sizeof(float); // extra stride on top of base NerfCoordinate struct + uint32_t n_elements = next_multiple(n_hit, BATCH_SIZE_GRANULARITY); + const uint32_t floats_per_coord = sizeof(NerfCoordinate) / sizeof(float) + nerf_network->n_extra_dims(); + const uint32_t extra_stride = nerf_network->n_extra_dims() * sizeof(float); // extra stride on top of base NerfCoordinate struct GPUMatrix positions_matrix{floats_per_coord, n_elements, stream}; GPUMatrix rgbsigma_matrix{4, n_elements, stream}; - linear_kernel(generate_nerf_network_inputs_at_current_position, 0, stream, n_hit, m_aabb, rays_hit.payload, PitchedPtr((NerfCoordinate*)positions_matrix.data(), 1, 0, extra_stride), extra_dims_gpu ); + linear_kernel(generate_nerf_network_inputs_at_current_position, 0, stream, n_hit, m_aabb, rays_hit.payload, PitchedPtr((NerfCoordinate*)positions_matrix.data(), 1, 0, extra_stride), extra_dims_gpu); if (visualized_dimension == -1) { - nerf_network.inference(stream, positions_matrix, rgbsigma_matrix); + nerf_network->inference(stream, positions_matrix, rgbsigma_matrix); linear_kernel(compute_nerf_rgba_kernel, 0, stream, n_hit, (vec4*)rgbsigma_matrix.data(), m_nerf.rgb_activation, m_nerf.density_activation, 0.01f, false); } else { - nerf_network.visualize_activation(stream, m_visualized_layer, visualized_dimension, positions_matrix, rgbsigma_matrix); + nerf_network->visualize_activation(stream, m_visualized_layer, visualized_dimension, positions_matrix, rgbsigma_matrix); } linear_kernel(shade_kernel_nerf, 0, stream, n_hit, + m_nerf.render_gbuffer_hard_edges, + camera_matrix1, + depth_scale, (vec4*)rgbsigma_matrix.data(), nullptr, rays_hit.payload, @@ -2477,6 +1954,9 @@ void Testbed::render_nerf( linear_kernel(shade_kernel_nerf, 0, stream, n_hit, + m_nerf.render_gbuffer_hard_edges, + camera_matrix1, + depth_scale, rays_hit.rgba, rays_hit.depth, rays_hit.payload, @@ -2601,7 +2081,7 @@ void Testbed::Nerf::Training::export_camera_extrinsics(const fs::path& path, boo mat4x3 Testbed::Nerf::Training::get_camera_extrinsics(int frame_idx) { if (frame_idx < 0 || frame_idx >= dataset.n_images) { - return mat4x3(1.0f); + return mat4x3::identity(); } return 
dataset.ngp_matrix_to_nerf(transforms[frame_idx].start); } @@ -2686,7 +2166,7 @@ void Testbed::load_nerf_post() { // moved the second half of load_nerf here m_nerf.training.cam_focal_length_gradient = vec2(0.0f); m_nerf.training.cam_focal_length_gradient_gpu.resize_and_copy_from_host(&m_nerf.training.cam_focal_length_gradient, 1); - m_nerf.training.reset_extra_dims(m_rng); + m_nerf.reset_extra_dims(m_rng); m_nerf.training.optimize_extra_dims = m_nerf.training.dataset.n_extra_learnable_dims > 0; if (m_nerf.training.dataset.has_rays) { @@ -2694,17 +2174,18 @@ void Testbed::load_nerf_post() { // moved the second half of load_nerf here } // Perturbation of the training cameras -- for debugging the online extrinsics learning code - // float perturb_amount = 0.0f; + // float perturb_amount = 0.01f; // if (perturb_amount > 0.f) { // for (uint32_t i = 0; i < m_nerf.training.dataset.n_images; ++i) { - // vec3 rot = random_val_3d(m_rng) * perturb_amount; - // float angle = rot.norm(); + // vec3 rot = (random_val_3d(m_rng) * 2.0f - 1.0f) * perturb_amount; + // vec3 trans = (random_val_3d(m_rng) * 2.0f - 1.0f) * perturb_amount; + // float angle = length(rot); // rot /= angle; - // auto trans = random_val_3d(m_rng); - // m_nerf.training.dataset.xforms[i].start.block<3,3>(0,0) = AngleAxisf(angle, rot).matrix() * m_nerf.training.dataset.xforms[i].start.block<3,3>(0,0); - // m_nerf.training.dataset.xforms[i].start[3] += trans * perturb_amount; - // m_nerf.training.dataset.xforms[i].end.block<3,3>(0,0) = AngleAxisf(angle, rot).matrix() * m_nerf.training.dataset.xforms[i].end.block<3,3>(0,0); - // m_nerf.training.dataset.xforms[i].end[3] += trans * perturb_amount; + + // auto rot_start = rotmat(angle, rot) * mat3(m_nerf.training.dataset.xforms[i].start); + // auto rot_end = rotmat(angle, rot) * mat3(m_nerf.training.dataset.xforms[i].end); + // m_nerf.training.dataset.xforms[i].start = mat4x3(rot_start[0], rot_start[1], rot_start[2], m_nerf.training.dataset.xforms[i].start[3] + trans); + // m_nerf.training.dataset.xforms[i].end = mat4x3(rot_end[0], rot_end[1], rot_end[2], m_nerf.training.dataset.xforms[i].end[3] + trans); // } // } @@ -2898,12 +2379,12 @@ __global__ void mark_density_grid_in_sphere_empty_kernel(const uint32_t n_elemen uint32_t level = i / NERF_GRID_N_CELLS(); uint32_t pos_idx = i % NERF_GRID_N_CELLS(); - uint32_t x = tcnn::morton3D_invert(pos_idx>>0); - uint32_t y = tcnn::morton3D_invert(pos_idx>>1); - uint32_t z = tcnn::morton3D_invert(pos_idx>>2); + uint32_t x = morton3D_invert(pos_idx>>0); + uint32_t y = morton3D_invert(pos_idx>>1); + uint32_t z = morton3D_invert(pos_idx>>2); float cell_radius = scalbnf(SQRT3(), level) / NERF_GRIDSIZE(); - vec3 cell_pos = ((vec3{(float)x+0.5f, (float)y+0.5f, (float)z+0.5f}) / (float)NERF_GRIDSIZE() - vec3(0.5f)) * scalbnf(1.0f, level) + vec3(0.5f); + vec3 cell_pos = ((vec3{(float)x+0.5f, (float)y+0.5f, (float)z+0.5f}) / (float)NERF_GRIDSIZE() - 0.5f) * scalbnf(1.0f, level) + 0.5f; // Disable if the cell touches the sphere (conservatively, by bounding the cell with a sphere) if (distance(pos, cell_pos) < radius + cell_radius) { @@ -2952,7 +2433,7 @@ float Testbed::NerfCounters::update_after_training(uint32_t target_batch_size, b } rays_per_batch = (uint32_t)((float)rays_per_batch * (float)target_batch_size / (float)measured_batch_size); - rays_per_batch = std::min(next_multiple(rays_per_batch, tcnn::batch_size_granularity), 1u << 18); + rays_per_batch = std::min(next_multiple(rays_per_batch, BATCH_SIZE_GRANULARITY), 1u << 18); return loss_scalar; } @@ 
-2998,7 +2479,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS uint32_t n_samples_per_image = (m_nerf.training.n_steps_between_error_map_updates * m_nerf.training.counters_rgb.rays_per_batch) / m_nerf.training.dataset.n_images; ivec2 res = m_nerf.training.dataset.metadata[0].resolution; m_nerf.training.error_map.resolution = min(ivec2((int)(std::sqrt(std::sqrt((float)n_samples_per_image)) * 3.5f)), res); - m_nerf.training.error_map.data.resize(compMul(m_nerf.training.error_map.resolution) * m_nerf.training.dataset.n_images); + m_nerf.training.error_map.data.resize(product(m_nerf.training.error_map.resolution) * m_nerf.training.dataset.n_images); CUDA_CHECK_THROW(cudaMemsetAsync(m_nerf.training.error_map.data.data(), 0, m_nerf.training.error_map.data.get_bytes(), stream)); } @@ -3007,16 +2488,15 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS CUDA_CHECK_THROW(cudaMemsetAsync(envmap_gradient, 0, sizeof(float)*m_envmap.envmap->n_params(), stream)); } - train_nerf_step(target_batch_size, m_nerf.training.counters_rgb, stream); - m_trainer->optimizer_step(stream, LOSS_SCALE); + m_trainer->optimizer_step(stream, LOSS_SCALE()); ++m_training_step; if (envmap_gradient) { - m_envmap.trainer->optimizer_step(stream, LOSS_SCALE); + m_envmap.trainer->optimizer_step(stream, LOSS_SCALE()); } float loss_scalar = m_nerf.training.counters_rgb.update_after_training(target_batch_size, get_loss_scalar, stream); @@ -3038,7 +2518,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS bool accumulate_error = true; if (accumulate_error && m_nerf.training.n_steps_since_error_map_update >= m_nerf.training.n_steps_between_error_map_updates) { m_nerf.training.error_map.cdf_resolution = m_nerf.training.error_map.resolution; - m_nerf.training.error_map.cdf_x_cond_y.resize(compMul(m_nerf.training.error_map.cdf_resolution) * m_nerf.training.dataset.n_images); + m_nerf.training.error_map.cdf_x_cond_y.resize(product(m_nerf.training.error_map.cdf_resolution) * m_nerf.training.dataset.n_images); m_nerf.training.error_map.cdf_y.resize(m_nerf.training.error_map.cdf_resolution.y * m_nerf.training.dataset.n_images); m_nerf.training.error_map.cdf_img.resize(m_nerf.training.dataset.n_images); @@ -3097,7 +2577,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS for (uint32_t i = 0; i < m_nerf.training.n_images_for_training; ++i) { std::vector gradient(n_extra_dims); for (uint32_t j = 0; j < n_extra_dims; ++j) { - gradient[j] = extra_dims_gradient[i * n_extra_dims + j] / LOSS_SCALE; + gradient[j] = extra_dims_gradient[i * n_extra_dims + j] / LOSS_SCALE(); } //float l2_reg = 1e-4f; @@ -3112,7 +2592,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS bool train_camera = m_nerf.training.optimize_extrinsics || m_nerf.training.optimize_distortion || m_nerf.training.optimize_focal_length || m_nerf.training.optimize_exposure; if (train_camera && m_nerf.training.n_steps_since_cam_update >= m_nerf.training.n_steps_between_cam_updates) { - float per_camera_loss_scale = (float)m_nerf.training.n_images_for_training / LOSS_SCALE / (float)m_nerf.training.n_steps_between_cam_updates; + float per_camera_loss_scale = (float)m_nerf.training.n_images_for_training / LOSS_SCALE() / (float)m_nerf.training.n_steps_between_cam_updates; if (m_nerf.training.optimize_extrinsics) { CUDA_CHECK_THROW(cudaMemcpyAsync(m_nerf.training.cam_pos_gradient.data(), 
m_nerf.training.cam_pos_gradient_gpu.data(), m_nerf.training.cam_pos_gradient_gpu.get_bytes(), cudaMemcpyDeviceToHost, stream)); @@ -3145,7 +2625,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS m_distortion.map->gradients(), m_distortion.map->gradient_weights() ); - m_distortion.trainer->optimizer_step(stream, LOSS_SCALE*(float)m_nerf.training.n_steps_between_cam_updates); + m_distortion.trainer->optimizer_step(stream, LOSS_SCALE() * (float)m_nerf.training.n_steps_between_cam_updates); } if (m_nerf.training.optimize_focal_length) { @@ -3161,7 +2641,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS if (m_nerf.training.optimize_exposure) { CUDA_CHECK_THROW(cudaMemcpyAsync(m_nerf.training.cam_exposure_gradient.data(), m_nerf.training.cam_exposure_gradient_gpu.data(), m_nerf.training.cam_exposure_gradient_gpu.get_bytes(), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); + vec3 mean_exposure = vec3(0.0f); // Optimization step @@ -3177,7 +2657,7 @@ void Testbed::train_nerf(uint32_t target_batch_size, bool get_loss_scalar, cudaS mean_exposure += m_nerf.training.cam_exposure[i].variable(); } - mean_exposure /= m_nerf.training.n_images_for_training; + mean_exposure /= (float)m_nerf.training.n_images_for_training; // Renormalize std::vector cam_exposures(m_nerf.training.n_images_for_training); @@ -3243,7 +2723,7 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, Testbed::NerfCounters& if (counters.measured_batch_size_before_compaction == 0) { counters.measured_batch_size_before_compaction = max_inference = max_samples; } else { - max_inference = next_multiple(std::min(counters.measured_batch_size_before_compaction, max_samples), tcnn::batch_size_granularity); + max_inference = next_multiple(std::min(counters.measured_batch_size_before_compaction, max_samples), BATCH_SIZE_GRANULARITY); } GPUMatrix compacted_coords_matrix((float*)coords_compacted, floats_per_coord, target_batch_size); @@ -3273,6 +2753,7 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, Testbed::NerfCounters& auto hg_enc = dynamic_cast*>(m_encoding.get()); + { linear_kernel(generate_training_samples_nerf, 0, stream, counters.rays_per_batch, m_aabb, @@ -3323,13 +2804,13 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, Testbed::NerfCounters& m_rng, target_batch_size, ray_counter, - LOSS_SCALE, + LOSS_SCALE(), padded_output_width, m_envmap.view(), envmap_gradient, m_envmap.resolution, m_envmap.loss_type, - m_background_color.rgb, + m_background_color.rgb(), m_color_space, m_nerf.training.random_bg_color, m_nerf.training.linear_colors, @@ -3368,14 +2849,15 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, Testbed::NerfCounters& m_nerf.training.depth_supervision_lambda, m_nerf.training.near_distance ); + } - fill_rollover_and_rescale<<>>( + fill_rollover_and_rescale<<>>( target_batch_size, padded_output_width, counters.numsteps_counter_compacted.data(), dloss_dmlp_out ); - fill_rollover<<>>( + fill_rollover<<>>( target_batch_size, floats_per_coord, counters.numsteps_counter_compacted.data(), (float*)coords_compacted ); - fill_rollover<<>>( + fill_rollover<<>>( target_batch_size, 1, counters.numsteps_counter_compacted.data(), max_level_compacted ); @@ -3384,10 +2866,7 @@ void Testbed::train_nerf_step(uint32_t target_batch_size, Testbed::NerfCounters& bool prepare_input_gradients = train_camera || train_extra_dims; GPUMatrix coords_gradient_matrix((float*)coords_gradient, floats_per_coord, 
target_batch_size); - { - auto ctx = m_network->forward(stream, compacted_coords_matrix, &compacted_rgbsigma_matrix, false, prepare_input_gradients); - m_network->backward(stream, *ctx, compacted_coords_matrix, compacted_rgbsigma_matrix, gradient_matrix, prepare_input_gradients ? &coords_gradient_matrix : nullptr, false, EGradientMode::Overwrite); - } + m_trainer->training_step(stream, compacted_coords_matrix, {}, nullptr, false, prepare_input_gradients ? &coords_gradient_matrix : nullptr, false, GradientMode::Overwrite, &gradient_matrix); if (train_extra_dims) { // Compute extra-dim gradients @@ -3473,7 +2952,7 @@ void Testbed::optimise_mesh_step(uint32_t n_steps) { GPUMatrix positions_matrix((float*)coords.data(), floats_per_coord, n_verts); GPUMatrix density_matrix(mlp_out.data(), padded_output_width, n_verts); - const float* extra_dims_gpu = get_inference_extra_dims(m_stream.get()); + const float* extra_dims_gpu = m_nerf.get_rendering_extra_dims(m_stream.get()); for (uint32_t i = 0; i < n_steps; ++i) { linear_kernel(generate_nerf_network_inputs_from_positions, 0, m_stream.get(), @@ -3521,7 +3000,7 @@ void Testbed::compute_mesh_vertex_colors() { m_mesh.vert_colors.memset(0); if (m_testbed_mode == ETestbedMode::Nerf) { - const float* extra_dims_gpu = get_inference_extra_dims(m_stream.get()); + const float* extra_dims_gpu = m_nerf.get_rendering_extra_dims(m_stream.get()); const uint32_t floats_per_coord = sizeof(NerfCoordinate) / sizeof(float) + m_nerf_network->n_extra_dims(); const uint32_t extra_stride = m_nerf_network->n_extra_dims() * sizeof(float); @@ -3591,7 +3070,7 @@ GPUMemory Testbed::get_rgba_on_grid(ivec3 res3d, vec3 ray_dir, bool voxel_ const uint32_t n_elements = (res3d.x*res3d.y*res3d.z); GPUMemory rgba(n_elements); - const float* extra_dims_gpu = get_inference_extra_dims(m_stream.get()); + const float* extra_dims_gpu = m_nerf.get_rendering_extra_dims(m_stream.get()); const uint32_t floats_per_coord = sizeof(NerfCoordinate) / sizeof(float) + m_nerf_network->n_extra_dims(); const uint32_t extra_stride = m_nerf_network->n_extra_dims() * sizeof(float); @@ -3672,18 +3151,105 @@ uint8_t* Testbed::Nerf::get_density_grid_bitfield_mip(uint32_t mip) { return density_grid_bitfield.data() + grid_mip_offset(mip)/8; } -int Testbed::find_best_training_view(int default_view) { - int bestimage = default_view; - float bestscore = 1000.f; - for (int i = 0; i < m_nerf.training.n_images_for_training; ++i) { - float score = distance(m_nerf.training.transforms[i].start[3], m_camera[3]); - score += 0.25f * distance(m_nerf.training.transforms[i].start[2], m_camera[2]); +void Testbed::Nerf::reset_extra_dims(default_rng_t& rng) { + uint32_t n_extra_dims = training.dataset.n_extra_dims(); + std::vector<float> extra_dims_cpu(n_extra_dims * (training.dataset.n_images + 1)); // n_images + 1 since we use an extra 'slot' for the inference latent code + float* dst = extra_dims_cpu.data(); + training.extra_dims_opt = std::vector<VarAdamOptimizer>(training.dataset.n_images, VarAdamOptimizer(n_extra_dims, 1e-4f)); + for (uint32_t i = 0; i < training.dataset.n_images; ++i) { + vec3 light_dir = warp_direction(normalize(training.dataset.metadata[i].light_dir)); + training.extra_dims_opt[i].reset_state(); + std::vector<float>& optimizer_value = training.extra_dims_opt[i].variable(); + for (uint32_t j = 0; j < n_extra_dims; ++j) { + if (training.dataset.has_light_dirs && j < 3) { + dst[j] = light_dir[j]; + } else { + dst[j] = random_val(rng) * 2.0f - 1.0f; + } + optimizer_value[j] = dst[j]; + } + dst += n_extra_dims; + } +
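// Layout sketch of the staging buffer filled above, one row per training image
// plus a trailing scratch slot (consumed later by get_rendering_extra_dims()
// when a light direction has to be spliced in at render time):
//
//   [ img 0: n_extra_dims floats | img 1 | ... | img n-1 | scratch slot ]
//
// When the dataset carries light directions, the first three entries of each row
// hold the warped direction; all remaining entries start as uniform random values
// in [-1, 1] and are refined by the per-image Adam optimizers during training.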
training.extra_dims_gpu.resize_and_copy_from_host(extra_dims_cpu); + + rendering_extra_dims.resize(training.dataset.n_extra_dims()); + CUDA_CHECK_THROW(cudaMemcpy(rendering_extra_dims.data(), training.extra_dims_gpu.data(), rendering_extra_dims.bytes(), cudaMemcpyDeviceToDevice)); +} + +const float* Testbed::Nerf::get_rendering_extra_dims(cudaStream_t stream) const { + CHECK_THROW(rendering_extra_dims.size() == training.dataset.n_extra_dims()); + + if (training.dataset.n_extra_dims() == 0) { + return nullptr; + } + + const float* extra_dims_src = rendering_extra_dims_from_training_view >= 0 ? + training.extra_dims_gpu.data() + rendering_extra_dims_from_training_view * training.dataset.n_extra_dims() : + rendering_extra_dims.data(); + + if (!training.dataset.has_light_dirs) { + return extra_dims_src; + } + + // the dataset has light directions, so we must construct a temporary buffer and fill it as requested. + // we use an extra 'slot' that was pre-allocated for us at the end of the extra_dims array. + size_t size = training.dataset.n_extra_dims() * sizeof(float); + float* dims_gpu = training.extra_dims_gpu.data() + training.dataset.n_images * training.dataset.n_extra_dims(); + CUDA_CHECK_THROW(cudaMemcpyAsync(dims_gpu, extra_dims_src, size, cudaMemcpyDeviceToDevice, stream)); + vec3 light_dir = warp_direction(normalize(this->light_dir)); // read the member; a self-initialized local here would be undefined + CUDA_CHECK_THROW(cudaMemcpyAsync(dims_gpu, &light_dir, min(size, sizeof(vec3)), cudaMemcpyHostToDevice, stream)); + return dims_gpu; +} + +int Testbed::Nerf::find_closest_training_view(mat4x3 pose) const { + int bestimage = training.view; + float bestscore = std::numeric_limits<float>::infinity(); + for (int i = 0; i < training.n_images_for_training; ++i) { + float score = distance(training.transforms[i].start[3], pose[3]); + score += 0.25f * distance(training.transforms[i].start[2], pose[2]); if (score < bestscore) { bestscore = score; bestimage = i; } } + return bestimage; } -NGP_NAMESPACE_END +void Testbed::Nerf::set_rendering_extra_dims_from_training_view(int trainview) { + if (!training.dataset.n_extra_dims()) { + throw std::runtime_error{"Dataset does not have extra dims."}; + } + + if (trainview < 0 || trainview >= training.dataset.n_images) { + throw std::runtime_error{"Invalid training view."}; + } + + rendering_extra_dims_from_training_view = trainview; +} + +void Testbed::Nerf::set_rendering_extra_dims(const std::vector<float>& vals) { + CHECK_THROW(rendering_extra_dims.size() == training.dataset.n_extra_dims()); + + if (vals.size() != training.dataset.n_extra_dims()) { + throw std::runtime_error{fmt::format("Invalid number of extra dims.
+
+std::vector<float> Testbed::Nerf::get_rendering_extra_dims_cpu() const {
+	CHECK_THROW(rendering_extra_dims.size() == training.dataset.n_extra_dims());
+
+	if (training.dataset.n_extra_dims() == 0) {
+		return {};
+	}
+
+	std::vector<float> extra_dims_cpu(training.dataset.n_extra_dims());
+	CUDA_CHECK_THROW(cudaMemcpy(extra_dims_cpu.data(), get_rendering_extra_dims(nullptr), rendering_extra_dims.bytes(), cudaMemcpyDeviceToHost));
+
+	return extra_dims_cpu;
+}
+
+}
diff --git a/src/testbed_sdf.cu b/src/testbed_sdf.cu
index 1aa41b4b8..006d53781 100644
--- a/src/testbed_sdf.cu
+++ b/src/testbed_sdf.cu
@@ -30,9 +30,7 @@
 #include
 #include
 
-using namespace tcnn;
-
-NGP_NAMESPACE_BEGIN
+namespace ngp {
 
 static constexpr uint32_t MARCH_ITER = 10000;
@@ -108,12 +106,12 @@ __device__ vec3 evaluate_shading(
 		return amb;
 	}
 
-	float luminance = dot(base_color, vec3(0.3f, 0.6f, 0.1f));
+	float luminance = dot(base_color, vec3{0.3f, 0.6f, 0.1f});
 
 	// normalize luminance to isolate hue and saturation components
 	vec3 Ctint = base_color * (1.f/(luminance+0.00001f));
-	vec3 Cspec0 = mix(mix(vec3(1.0f,1.0f,1.0f), Ctint, specular_tint) * specular * 0.08f, base_color, metallic);
-	vec3 Csheen = mix(vec3(1.0f,1.0f,1.0f), Ctint, sheen_tint);
+	vec3 Cspec0 = mix(mix(vec3(1.0f), Ctint, specular_tint) * specular * 0.08f, base_color, metallic);
+	vec3 Csheen = mix(vec3(1.0f), Ctint, sheen_tint);
 
 	float Fd90 = 0.5f + 2.0f * LdotH * LdotH * roughness;
 	float Fd = mix(1, Fd90, FL) * mix(1.f, Fd90, FV);
@@ -129,7 +127,7 @@ __device__ vec3 evaluate_shading(
 	float a= std::max(0.001f, square(roughness));
 	float Ds = G2(NdotH, a);
 	float FH = SchlickFresnel(LdotH);
-	vec3 Fs = mix(Cspec0, vec3(1.0f,1.0f,1.0f), FH);
+	vec3 Fs = mix(Cspec0, vec3(1.0f), FH);
 	float Gs = SmithG_GGX(NdotL, a) * SmithG_GGX(NdotV, a);
 
 	// sheen
@@ -142,7 +140,7 @@ __device__ vec3 evaluate_shading(
 	float CCs=0.25f * clearcoat * Gr * Fr * Dr;
 	vec3 brdf = (float(1.0f / PI()) * mix(Fd, ss, subsurface) * base_color + Fsheen) * (1.0f - metallic) +
-		Gs * Fs * Ds + vec3(CCs,CCs,CCs);
+		Gs * Fs * Ds + vec3{CCs, CCs, CCs};
 	return vec3(brdf * light_color) * NdotL + amb;
 }
@@ -324,7 +322,7 @@ __global__ void shade_kernel_sdf(
 	vec3 pos = positions[i];
 	bool floor = false;
 	if (pos.y < floor_y + 0.001f && payload.dir.y < 0.f) {
-		normal = vec3(0.f, 1.f, 0.f);
+		normal = vec3{0.0f, 1.0f, 0.0f};
 		floor = true;
 	}
@@ -361,14 +359,14 @@ __global__ void shade_kernel_sdf(
 		} break;
 		case ERenderMode::Depth: color = vec3(dot(cam_fwd, pos - cam_pos)); break;
 		case ERenderMode::Positions: {
-			color = (pos - vec3(0.5f)) / 2.0f + vec3(0.5f);
+			color = (pos - 0.5f) / 2.0f + 0.5f;
 		} break;
-		case ERenderMode::Normals: color = 0.5f * normal + vec3(0.5f); break;
+		case ERenderMode::Normals: color = 0.5f * normal + 0.5f; break;
 		case ERenderMode::Cost: color = vec3((float)payload.n_steps / 30); break;
 		case ERenderMode::EncodingVis: color = normals[i]; break;
 	}
 
-	frame_buffer[payload.idx] = {color.rgb, 1.0f};
+	frame_buffer[payload.idx] = {color.r, color.g, color.b, 1.0f};
 	depth_buffer[payload.idx] = dot(cam_fwd, pos - cam_pos);
 }
@@ -543,7 +541,7 @@ __global__ void init_rays_with_payload_kernel_sdf(
 	Ray ray = pixel_to_ray(
 		sample_index,
-		{x, y},
+		{(int)x, (int)y},
 		resolution,
 		focal_length,
 		camera_matrix,
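The `enlarge` hunks just below swap `tcnn::batch_size_granularity` for the renamed `BATCH_SIZE_GRANULARITY`; both round buffer sizes up so that network batches keep tcnn's required alignment. A standalone sketch of the rounding, assuming `next_multiple` has the usual ceil-to-multiple semantics and a granularity of 256 (check the tcnn version in use for the actual constant):

	#include <cstdint>

	// Round n up to the next multiple of granularity.
	constexpr uint32_t next_multiple_of(uint32_t n, uint32_t granularity) {
		return ((n + granularity - 1) / granularity) * granularity;
	}

	// e.g. 1000 rays are padded to 1024; the 24 padding elements are allocated
	// but never read back, since the kernels only consume the first n entries.
	static_assert(next_multiple_of(1000, 256) == 1024, "padding example");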
@@ -798,7 +796,7 @@ uint32_t Testbed::SphereTracer::trace(
 }
 
 void Testbed::SphereTracer::enlarge(size_t n_elements, cudaStream_t stream) {
-	n_elements = next_multiple(n_elements, size_t(tcnn::batch_size_granularity));
+	n_elements = next_multiple(n_elements, size_t(BATCH_SIZE_GRANULARITY));
 	auto scratch = allocate_workspace_and_distribute<
 		vec3, vec3, float, float, float, float, SdfPayload, // m_rays[0]
 		vec3, vec3, float, float, float, float, SdfPayload, // m_rays[1]
@@ -824,7 +822,7 @@ void Testbed::SphereTracer::enlarge(size_t n_elements, cudaStream_t stream) {
 }
 
 void Testbed::FiniteDifferenceNormalsApproximator::enlarge(uint32_t n_elements, cudaStream_t stream) {
-	n_elements = next_multiple(n_elements, tcnn::batch_size_granularity);
+	n_elements = next_multiple(n_elements, BATCH_SIZE_GRANULARITY);
 	auto scratch = allocate_workspace_and_distribute<
 		vec3, vec3, vec3, float, float, float,
@@ -960,10 +958,10 @@ void Testbed::render_sdf(
 	if (m_render_mode == ERenderMode::Slice) {
 		if (visualized_dimension == -1) {
 			distance_function(n_hit, rays_hit.pos, rays_hit.distance, stream);
-			extract_dimension_pos_neg_kernel<float><<<n_blocks_linear(n_hit*3), n_threads_linear, 0, stream>>>(n_hit*3, 0, 1, 3, rays_hit.distance, CM, (float*)rays_hit.normal);
+			extract_dimension_pos_neg_kernel<float><<<n_blocks_linear(n_hit*3), N_THREADS_LINEAR, 0, stream>>>(n_hit*3, 0, 1, 3, rays_hit.distance, CM, (float*)rays_hit.normal);
 		} else {
 			// Store colors in the normal buffer
-			uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity);
+			uint32_t n_elements = next_multiple(n_hit, BATCH_SIZE_GRANULARITY);
 
 			GPUMatrix<float> positions_matrix((float*)rays_hit.pos, 3, n_elements);
 			GPUMatrix<float> colors_matrix((float*)rays_hit.normal, 3, n_elements);
@@ -1024,7 +1022,7 @@ void Testbed::render_sdf(
 		}
 	} else if (render_mode == ERenderMode::EncodingVis && m_render_mode != ERenderMode::Slice) {
 		// HACK: Store colors temporarily in the normal buffer
-		uint32_t n_elements = next_multiple(n_hit, tcnn::batch_size_granularity);
+		uint32_t n_elements = next_multiple(n_hit, BATCH_SIZE_GRANULARITY);
 
 		GPUMatrix<float> positions_matrix((float*)rays_hit.pos, 3, n_elements);
 		GPUMatrix<float> colors_matrix((float*)rays_hit.normal, 3, n_elements);
@@ -1124,13 +1122,13 @@ void Testbed::load_mesh(const fs::path& data_path) {
 	const float inflation = 0.005f;
 	m_raw_aabb.inflate(length(m_raw_aabb.diag()) * inflation);
-	m_sdf.mesh_scale = compMax(m_raw_aabb.diag());
+	m_sdf.mesh_scale = max(m_raw_aabb.diag());
 
 	// Normalize vertex coordinates to lie within [0,1]^3.
 	// This way, none of the constants need to carry around
 	// bounding box factors.
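 	// [Sketch, not part of the patch: with c = m_raw_aabb.min + 0.5f * m_raw_aabb.diag()
 	// and s = m_sdf.mesh_scale = max(m_raw_aabb.diag()), the loop below applies
 	// v' = (v - c) / s + 0.5f, which maps the longest axis of the inflated AABB
 	// onto [0, 1] and centers the mesh at (0.5, 0.5, 0.5).]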
for (size_t i = 0; i < n_vertices; ++i) { - vertices[i] = (vertices[i] - m_raw_aabb.min - 0.5f * m_raw_aabb.diag()) / m_sdf.mesh_scale + vec3(0.5f); + vertices[i] = (vertices[i] - m_raw_aabb.min - 0.5f * m_raw_aabb.diag()) / m_sdf.mesh_scale + 0.5f; } m_aabb = {}; @@ -1141,7 +1139,7 @@ void Testbed::load_mesh(const fs::path& data_path) { m_aabb.inflate(length(m_aabb.diag()) * inflation); m_aabb = m_aabb.intersection(BoundingBox{vec3(0.0f), vec3(1.0f)}); m_render_aabb = m_aabb; - m_render_aabb_to_local = mat3(1.0f); + m_render_aabb_to_local = mat3::identity(); m_mesh.thresh = 0.f; m_sdf.triangles_cpu.resize(n_triangles); @@ -1397,4 +1395,4 @@ double Testbed::calculate_iou(uint32_t n_samples, float scale_existing_results_f return countercpu[4]/double(countercpu[5]); } -NGP_NAMESPACE_END +} diff --git a/src/testbed_volume.cu b/src/testbed_volume.cu index 63581115e..efdfff42e 100644 --- a/src/testbed_volume.cu +++ b/src/testbed_volume.cu @@ -31,9 +31,7 @@ #include -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { Testbed::NetworkDims Testbed::network_dims_volume() const { NetworkDims dims; @@ -54,7 +52,7 @@ __device__ vec4 proc_envmap(const vec3& dir, const vec3& up_dir, const vec3& sun sunam *= sunam; vec4 result; - result.rgb = skycol * skyam + vec3{255.f/255.0f, 215.f/255.0f, 195.f/255.0f} * (20.f * sunam); + result.rgb() = skycol * skyam + vec3{255.f/255.0f, 215.f/255.0f, 195.f/255.0f} * (20.f * sunam); result.a = 1.0f; return result; } @@ -75,7 +73,7 @@ __device__ inline bool walk_to_next_event(default_rng_t &rng, const BoundingBox float dt = -std::log(1.0f - zeta1) * scale; // todo - for spatially varying majorant, we must check dt against the range over which the majorant is defined. we can turn this into an optical thickness accumulating loop... pos += dir*dt; if (!aabb.contains(pos)) return false; // escape to the mooon! - uint32_t bitidx = tcnn::morton3D(int(pos.x*128.f+0.5f),int(pos.y*128.f+0.5f),int(pos.z*128.f+0.5f)); + uint32_t bitidx = morton3D(int(pos.x*128.f+0.5f),int(pos.y*128.f+0.5f),int(pos.z*128.f+0.5f)); if (bitidx<128*128*128 && bitgrid[bitidx>>3]&(1<<(bitidx&7))) break; // loop around and try again as we are in density=0 region! 
} @@ -112,7 +110,7 @@ __global__ void volume_generate_training_data_kernel(uint32_t n_elements, auto acc = grid->tree().getAccessor(); while (numout < MAX_TRAIN_VERTICES) { uint32_t prev_numout = numout; - vec3 pos = random_dir(rng) * 2.0f + vec3(0.5f); + vec3 pos = random_dir(rng) * 2.0f + 0.5f; vec3 target = random_val_3d(rng) * aabb.diag() + aabb.min; vec3 dir = normalize(target - pos); auto box_intersection = aabb.ray_intersect(pos, dir); @@ -169,25 +167,25 @@ void Testbed::train_volume(size_t target_batch_size, bool get_loss_scalar, cudaS m_volume.training.targets.enlarge(n_elements); float distance_scale = 1.f/std::max(m_volume.inv_distance_scale,0.01f); - auto sky_col = m_background_color.rgb; + auto sky_col = m_background_color.rgb(); linear_kernel(volume_generate_training_data_kernel, 0, stream, n_elements / MAX_TRAIN_VERTICES, - m_volume.training.positions.data(), - m_volume.training.targets.data(), - m_volume.nanovdb_grid.data(), - m_volume.bitgrid.data(), - m_volume.world2index_offset, - m_volume.world2index_scale, - m_render_aabb, - m_rng, - m_volume.albedo, - m_volume.scattering, - distance_scale, - m_volume.global_majorant, - m_up_dir, - m_sun_dir, - sky_col - ); + m_volume.training.positions.data(), + m_volume.training.targets.data(), + m_volume.nanovdb_grid.data(), + m_volume.bitgrid.data(), + m_volume.world2index_offset, + m_volume.world2index_scale, + m_render_aabb, + m_rng, + m_volume.albedo, + m_volume.scattering, + distance_scale, + m_volume.global_majorant, + m_up_dir, + m_sun_dir, + sky_col + ); m_rng.advance(n_elements*256); GPUMatrix training_batch_matrix((float*)(m_volume.training.positions.data()), n_input_dims, batch_size); @@ -243,7 +241,7 @@ __global__ void init_rays_volume( Ray ray = pixel_to_ray( sample_index, - {x, y}, + {(int)x, (int)y}, resolution, focal_length, camera_matrix, @@ -398,7 +396,7 @@ __global__ void volume_render_kernel_step( if (extinction_prob>1.f) extinction_prob=1.f; float T = 1.f - payload.col.a; float alpha = extinction_prob * T; - payload.col.rgb += local_output.rgb * alpha; + payload.col.rgb() += local_output.rgb() * alpha; payload.col.a += alpha; if (payload.col.a > 0.99f || !walk_to_next_event(rng, aabb, pos, dir, bitgrid, scale) || force_finish_ray) { payload.col += (1.f-payload.col.a) * proc_envmap_render(dir, up_dir, sun_dir, sky_col); @@ -423,14 +421,14 @@ void Testbed::render_volume( auto res = render_buffer.resolution; size_t n_pixels = (size_t)res.x * res.y; - for (uint32_t i=0;i<2;++i) { + for (uint32_t i = 0; i < 2; ++i) { m_volume.pos[i].enlarge(n_pixels); m_volume.payload[i].enlarge(n_pixels); } m_volume.hit_counter.enlarge(2); m_volume.hit_counter.memset(0); - vec3 sky_col = m_background_color.rgb; + vec3 sky_col = m_background_color.rgb(); const dim3 threads = { 16, 8, 1 }; const dim3 blocks = { div_round_up((uint32_t)res.x, threads.x), div_round_up((uint32_t)res.y, threads.y), 1 }; @@ -462,11 +460,11 @@ void Testbed::render_volume( m_sun_dir, sky_col ); - m_rng.advance(n_pixels*256); + m_rng.advance(n_pixels * 256); - uint32_t n=n_pixels; - CUDA_CHECK_THROW(cudaDeviceSynchronize()); - cudaMemcpy(&n, m_volume.hit_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost); + uint32_t n = n_pixels; + CUDA_CHECK_THROW(cudaMemcpyAsync(&n, m_volume.hit_counter.data(), sizeof(uint32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK_THROW(cudaStreamSynchronize(stream)); if (m_render_ground_truth) { linear_kernel(volume_render_kernel_gt, 0, stream, @@ -491,21 +489,21 @@ void Testbed::render_volume( m_volume.scattering, 
 			render_buffer.frame_buffer
 		);
-		m_rng.advance(n_pixels*256);
+		m_rng.advance(n_pixels * 256);
 	} else {
 		m_volume.radiance_and_density.enlarge(n);
 		int max_iter = 64;
-		for (int iter=0;iter<max_iter&&n>0;++iter) {
-			uint32_t srcbuf=(iter&1);
-			uint32_t dstbuf=1-srcbuf;
+		for (int iter = 0; iter < max_iter && n > 0; ++iter) {
+			uint32_t srcbuf = (iter & 1);
+			uint32_t dstbuf = 1 - srcbuf;
 
-			uint32_t n_elements = next_multiple(n, tcnn::batch_size_granularity);
+			uint32_t n_elements = next_multiple(n, BATCH_SIZE_GRANULARITY);
 
 			GPUMatrix<float> positions_matrix((float*)m_volume.pos[srcbuf].data(), 3, n_elements);
 			GPUMatrix<float> densities_matrix((float*)m_volume.radiance_and_density.data(), 4, n_elements);
 			m_network->inference(stream, positions_matrix, densities_matrix);
 
-			cudaMemsetAsync(m_volume.hit_counter.data()+dstbuf,0,sizeof(uint32_t));
+			CUDA_CHECK_THROW(cudaMemsetAsync(m_volume.hit_counter.data() + dstbuf, 0, sizeof(uint32_t), stream));
 
 			linear_kernel(volume_render_kernel_step, 0, stream,
 				n,
@@ -533,11 +531,12 @@ void Testbed::render_volume(
 				render_buffer.frame_buffer,
 				(iter>=max_iter-1)
 			);
-			m_rng.advance(n_pixels*256);
-			if (((iter+1) % 4)==0) {
+
+			m_rng.advance(n_pixels * 256);
+			if (((iter + 1) % 4) == 0) {
 				// periodically tell the cpu how many pixels are left
-				CUDA_CHECK_THROW(cudaMemcpyAsync(&n, m_volume.hit_counter.data()+dstbuf, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
-				CUDA_CHECK_THROW(cudaDeviceSynchronize());
+				CUDA_CHECK_THROW(cudaMemcpyAsync(&n, m_volume.hit_counter.data() + dstbuf, sizeof(uint32_t), cudaMemcpyDeviceToHost, stream));
+				CUDA_CHECK_THROW(cudaStreamSynchronize(stream));
 			}
 		}
 	}
@@ -546,27 +545,27 @@
 
 #define NANOVDB_MAGIC_NUMBER 0x304244566f6e614eUL // "NanoVDB0" in hex - little endian (uint64_t)
 
 struct NanoVDBFileHeader {
-	uint64_t magic; // 8 bytes
-	uint32_t version; // 4 bytes version numbers
-	uint16_t gridCount; // 2 bytes
-	uint16_t codec; // 2 bytes - must be 0
+	uint64_t magic;     // 8 bytes
+	uint32_t version;   // 4 bytes version numbers
+	uint16_t gridCount; // 2 bytes
+	uint16_t codec;     // 2 bytes - must be 0
 };
 static_assert(sizeof(NanoVDBFileHeader) == 16, "nanovdb padding error");
 
 struct NanoVDBMetaData {
-	uint64_t gridSize, fileSize, nameKey, voxelCount; // 4 * 8 = 32B.
-	uint32_t gridType; // 4B.
-	uint32_t gridClass; // 4B.
-	double worldBBox[2][3]; // 2 * 3 * 8 = 48B.
-	int indexBBox[2][3]; // 2 * 3 * 4 = 24B.
-	double voxelSize[3]; // 24B.
-	uint32_t nameSize; // 4B.
-	uint32_t nodeCount[4]; // 4 x 4 = 16B
-	uint32_t tileCount[3]; // 3 x 4 = 12B
-	uint16_t codec; // 2B
-	uint16_t padding; // 2B, due to 8B alignment from uint64_t
-	uint32_t version; // 4B
+	uint64_t gridSize, fileSize, nameKey, voxelCount; // 4 * 8 = 32B.
+	uint32_t gridType;      // 4B.
+	uint32_t gridClass;     // 4B.
+	double worldBBox[2][3]; // 2 * 3 * 8 = 48B.
+	int indexBBox[2][3];    // 2 * 3 * 4 = 24B.
+	double voxelSize[3];    // 24B.
+	uint32_t nameSize;      // 4B.
+ uint32_t nodeCount[4]; // 4 x 4 = 16B + uint32_t tileCount[3]; // 3 x 4 = 12B + uint16_t codec; // 2B + uint16_t padding; // 2B, due to 8B alignment from uint64_t + uint32_t version; // 4B }; static_assert(sizeof(NanoVDBMetaData) == 176, "nanovdb padding error"); @@ -617,7 +616,7 @@ void Testbed::load_volume(const fs::path& data_path) { vec3{0.5f - xsize * scale * 0.5f, 0.5f - ysize * scale * 0.5f, 0.5f - zsize * scale * 0.5f}, vec3{0.5f + xsize * scale * 0.5f, 0.5f + ysize * scale * 0.5f, 0.5f + zsize * scale * 0.5f}, }; - m_render_aabb_to_local = mat3(1.0f); + m_render_aabb_to_local = mat3::identity(); m_volume.world2index_scale = maxsize; m_volume.world2index_offset = vec3{ @@ -639,7 +638,7 @@ void Testbed::load_volume(const fs::path& data_path) { float fx = ((i + 0.5f) - m_volume.world2index_offset.x) / m_volume.world2index_scale; float fy = ((j + 0.5f) - m_volume.world2index_offset.y) / m_volume.world2index_scale; float fz = ((k + 0.5f) - m_volume.world2index_offset.z) / m_volume.world2index_scale; - uint32_t bitidx = tcnn::morton3D(int(fx * 128.0f + 0.5f), int(fy * 128.0f + 0.5f), int(fz * 128.0f + 0.5f)); + uint32_t bitidx = morton3D(int(fx * 128.0f + 0.5f), int(fy * 128.0f + 0.5f), int(fz * 128.0f + 0.5f)); if (bitidx < 128 * 128 * 128) bitgrid[bitidx / 8] |= 1 << (bitidx & 7); } @@ -650,4 +649,4 @@ void Testbed::load_volume(const fs::path& data_path) { m_volume.global_majorant = mx; } -NGP_NAMESPACE_END +} diff --git a/src/thread_pool.cpp b/src/thread_pool.cpp index 6939ecdba..146d372d5 100644 --- a/src/thread_pool.cpp +++ b/src/thread_pool.cpp @@ -16,7 +16,7 @@ #include -NGP_NAMESPACE_BEGIN +namespace ngp { ThreadPool::ThreadPool() : ThreadPool{std::thread::hardware_concurrency()} {} @@ -98,4 +98,4 @@ void ThreadPool::flush_queue() { m_task_queue.clear(); } -NGP_NAMESPACE_END +} diff --git a/src/tinyexr_wrapper.cu b/src/tinyexr_wrapper.cu index 00e451a81..0ff33118e 100644 --- a/src/tinyexr_wrapper.cu +++ b/src/tinyexr_wrapper.cu @@ -20,7 +20,7 @@ #include -#ifdef __NVCC__ +#ifdef __CUDACC__ # ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ # pragma nv_diag_suppress 174 # pragma nv_diag_suppress 550 @@ -33,9 +33,7 @@ #define TINYEXR_IMPLEMENTATION #include -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { template __global__ void interleave_and_cast_kernel(const uint32_t num_pixels, bool has_alpha, const T* __restrict__ in, __half* __restrict__ out, bool fix_pre_mult) { @@ -257,4 +255,4 @@ __half* load_exr_to_gpu(int* width, int* height, const fs::path& path, bool fix_ return result; } -NGP_NAMESPACE_END +} diff --git a/src/tinyobj_loader_wrapper.cpp b/src/tinyobj_loader_wrapper.cu similarity index 95% rename from src/tinyobj_loader_wrapper.cpp rename to src/tinyobj_loader_wrapper.cu index 0a0fe8e69..5b844ef07 100644 --- a/src/tinyobj_loader_wrapper.cpp +++ b/src/tinyobj_loader_wrapper.cu @@ -14,7 +14,7 @@ * interface to load OBJ-based meshes. 
*/ -#include +#include #include #include @@ -22,10 +22,9 @@ #define TINYOBJLOADER_IMPLEMENTATION #include -#include #include -NGP_NAMESPACE_BEGIN +namespace ngp { std::vector load_obj(const fs::path& path) { tinyobj::attrib_t attrib; @@ -81,4 +80,4 @@ std::vector load_obj(const fs::path& path) { return result; } -NGP_NAMESPACE_END +} diff --git a/src/triangle_bvh.cu b/src/triangle_bvh.cu index 7f8092741..0e091bea6 100644 --- a/src/triangle_bvh.cu +++ b/src/triangle_bvh.cu @@ -12,8 +12,9 @@ * @author Thomas Müller & Alex Evans, NVIDIA */ -#include +#include #include + #include #include @@ -38,9 +39,7 @@ namespace optix_ptx { } #endif //NGP_OPTIX -using namespace tcnn; - -NGP_NAMESPACE_BEGIN +namespace ngp { constexpr float MAX_DIST = 10.0f; @@ -409,7 +408,7 @@ public: vec3 closest_point = tri.closest_point(point); vec3 avg_normal = avg_normal_around_point(closest_point, bvhnodes, triangles); - return std::copysignf(p.second, dot(avg_normal, point - closest_point)); + return copysign(p.second, dot(avg_normal, point - closest_point)); } __host__ __device__ static float signed_distance_raystab(const vec3& point, const TriangleBvhNode* __restrict__ bvhnodes, const Triangle* __restrict__ triangles, float max_distance_sq, default_rng_t rng={}) { @@ -543,7 +542,7 @@ public: // Root m_nodes.emplace_back(); - m_nodes.front().bb = BoundingBox(std::begin(triangles), std::end(triangles)); + m_nodes.front().bb = BoundingBox(triangles.data(), triangles.data() + triangles.size()); struct BuildNode { int node_idx; @@ -584,7 +583,7 @@ public: } var /= (float)std::distance(child.begin, child.end); - float max_val = compMax(var); + float max_val = max(var); int axis = var.x == max_val ? 0 : (var.y == max_val ? 1 : 2); auto m = child.begin + std::distance(child.begin, child.end)/2; @@ -606,7 +605,7 @@ public: child.node_idx = (int)m_nodes.size(); m_nodes.emplace_back(); - m_nodes.back().bb = BoundingBox(child.begin, child.end); + m_nodes.back().bb = BoundingBox(&*child.begin, &*child.end); if (std::distance(child.begin, child.end) <= n_primitives_per_leaf) { m_nodes.back().left_idx = -(int)std::distance(std::begin(triangles), child.begin)-1; @@ -721,6 +720,6 @@ __global__ void raytrace_kernel(uint32_t n_elements, vec3* __restrict__ position } } -NGP_NAMESPACE_END +}
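A closing note on the `copysign` change in the BVH signed-distance code above: the sign test is independent of the GLM-to-tcnn switch and can be sketched standalone (hypothetical names; assumes `avg_normal` is the angle-weighted outward normal around the closest surface point):

	#include <cmath>

	struct v3 { float x, y, z; };
	inline float dot3(v3 a, v3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }

	// Give the unsigned distance d the sign of the query offset projected onto
	// the surface normal: positive outside the mesh, negative inside.
	inline float signed_distance(float d, v3 avg_normal, v3 point_minus_closest) {
		return std::copysign(d, dot3(avg_normal, point_minus_closest));
	}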