From b9cf301b10c464f4e31813fccc91c5755efa1276 Mon Sep 17 00:00:00 2001
From: Thomas Rouch
Date: Thu, 23 Jun 2022 14:59:24 +0200
Subject: [PATCH] :bug: take 'dataset scale' into account when using orthographic camera

---
 .../common_device.cuh                        | 22 ++++++++++++++-----
 include/neural-graphics-primitives/testbed.h |  3 ++-
 src/testbed.cu                               |  9 +++++---
 src/testbed_nerf.cu                          | 15 ++++++++-----
 4 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/include/neural-graphics-primitives/common_device.cuh b/include/neural-graphics-primitives/common_device.cuh
index 0ad58fc8e..9ff5be99f 100644
--- a/include/neural-graphics-primitives/common_device.cuh
+++ b/include/neural-graphics-primitives/common_device.cuh
@@ -273,7 +273,8 @@ inline __host__ __device__ Ray pixel_to_ray(
 	const ECameraMode camera_mode = ECameraMode::Perspective,
 	const CameraDistortion& camera_distortion = {},
 	const float* __restrict__ distortion_data = nullptr,
-	const Eigen::Vector2i distortion_resolution = Eigen::Vector2i::Zero()
+	const Eigen::Vector2i distortion_resolution = Eigen::Vector2i::Zero(),
+	const float dataset_scale = 1.f
 ) {
 	Eigen::Vector2f offset = ld_random_pixel_offset(snap_to_pixel_centers ? 0 : spp);
 	Eigen::Vector2f uv = (pixel.cast<float>() + offset).cwiseQuotient(resolution.cast<float>());
@@ -283,12 +284,16 @@ inline __host__ __device__ Ray pixel_to_ray(
 	Eigen::Vector3f dir;
 	Eigen::Vector3f head_pos;
 	if(camera_mode == ECameraMode::Orthographic){
+		// 'dataset_scale' argument is only required by the orthographic camera.
+		// The focal length of Environment and Perspective cameras isn't affected by the change of dataset_scale,
+		// because all rays originate from the same point
 		dir = {0.f, 0.f, 1.f}; // Camera forward
 		head_pos = {
 			(uv.x() - screen_center.x()) * (float)resolution.x() / focal_length.x(),
 			(uv.y() - screen_center.y()) * (float)resolution.y() / focal_length.y(),
 			0.0f
 		};
+		head_pos *= dataset_scale;
 		head_pos += shift;
 		dir -= shift / parallax_shift.z();	// we could use focus_z here in the denominator. for now, we pack m_scale in here.
 	}
@@ -354,7 +359,8 @@ inline __host__ __device__ Eigen::Vector2f pos_to_pixel(
 	const Eigen::Vector2f& screen_center,
 	const Eigen::Vector3f& parallax_shift,
 	const ECameraMode camera_mode,
-	const CameraDistortion& camera_distortion = {}
+	const CameraDistortion& camera_distortion = {},
+	const float dataset_scale = 1.f
 ) {
 	// We get 'pos' as an input. We have pos = origin + alpha*dir, with unknown alpha
 	// tmp_dir = R^-1*(pos-t)
@@ -368,7 +374,8 @@ inline __host__ __device__ Eigen::Vector2f pos_to_pixel(
 	// origin = R*(head_pos+shift) + t
 	tmp_dir -= shift;
 	const Eigen::Vector3f head_dir_minus_shift = Eigen::Vector3f(0.f, 0.f, 1.f) - shift/parallax_shift.z();
-	const Eigen::Vector3f head_pos = tmp_dir - tmp_dir.z() * head_dir_minus_shift; // Gives head_pos.z=0 since head_dir_minus_shift.z=1
+	Eigen::Vector3f head_pos = tmp_dir - tmp_dir.z() * head_dir_minus_shift; // Gives head_pos.z=0 since head_dir_minus_shift.z=1
+	head_pos /= dataset_scale;
 	return {
 		head_pos.x() * focal_length.x() + screen_center.x() * resolution.x(),
 		head_pos.y() * focal_length.y() + screen_center.y() * resolution.y(),
@@ -426,7 +433,8 @@ inline __host__ __device__ Eigen::Vector2f motion_vector_3d(
 	const bool snap_to_pixel_centers,
 	const float depth,
 	const ECameraMode camera_mode,
-	const CameraDistortion& camera_distortion = {}
+	const CameraDistortion& camera_distortion = {},
+	const float dataset_scale = 1.f
 ) {
 	Ray ray = pixel_to_ray(
 		sample_index,
@@ -442,7 +450,8 @@ inline __host__ __device__ Eigen::Vector2f motion_vector_3d(
 		camera_mode,
 		camera_distortion,
 		nullptr,
-		Eigen::Vector2i::Zero()
+		Eigen::Vector2i::Zero(),
+		dataset_scale
 	);

 	Eigen::Vector2f prev_pixel = pos_to_pixel(
@@ -453,7 +462,8 @@ inline __host__ __device__ Eigen::Vector2f motion_vector_3d(
 		screen_center,
 		parallax_shift,
 		camera_mode,
-		camera_distortion
+		camera_distortion,
+		dataset_scale
 	);

 	return prev_pixel - (pixel.cast<float>() + ld_random_pixel_offset(sample_index));
diff --git a/include/neural-graphics-primitives/testbed.h b/include/neural-graphics-primitives/testbed.h
index 9aa7a7e95..dcc5df818 100644
--- a/include/neural-graphics-primitives/testbed.h
+++ b/include/neural-graphics-primitives/testbed.h
@@ -157,7 +157,8 @@ class Testbed {
 			float cone_angle_constant,
 			ERenderMode render_mode,
 			ECameraMode camera_mode,
-			cudaStream_t stream
+			cudaStream_t stream,
+			float dataset_scale
 		);

 		uint32_t trace(
diff --git a/src/testbed.cu b/src/testbed.cu
index eabf6fced..bd0bf3508 100644
--- a/src/testbed.cu
+++ b/src/testbed.cu
@@ -2450,7 +2450,8 @@ __global__ void dlss_prep_kernel(
 	const Vector2f image_pos,
 	const Vector2f prev_image_pos,
 	const Vector2i image_resolution,
-	const ECameraMode camera_mode
+	const ECameraMode camera_mode,
+	const float dataset_scale = 1.f
 ) {
 	uint32_t x = threadIdx.x + blockDim.x * blockIdx.x;
 	uint32_t y = threadIdx.y + blockDim.y * blockIdx.y;
@@ -2489,7 +2490,8 @@ __global__ void dlss_prep_kernel(
 		snap_to_pixel_centers,
 		depth,
 		camera_mode,
-		camera_distortion
+		camera_distortion,
+		dataset_scale
 	);

 	surf2Dwrite(make_float2(mvec.x(), mvec.y()), mvec_surface, x_orig * sizeof(float2), y_orig);
@@ -2652,7 +2654,8 @@ void Testbed::render_frame(const Matrix<float, 3, 4>& camera_matrix0, const Matr
 			m_image.pos,
 			m_image.prev_pos,
 			m_image.resolution,
-			m_camera_mode
+			m_camera_mode,
+			m_nerf.training.dataset.scale
 		);

 		render_buffer.set_dlss_sharpening(m_dlss_sharpening);
diff --git a/src/testbed_nerf.cu b/src/testbed_nerf.cu
index 5b1bc206f..b0d34964a 100644
--- a/src/testbed_nerf.cu
+++ b/src/testbed_nerf.cu
@@ -1791,7 +1791,8 @@ __global__ void init_rays_with_payload_kernel_nerf(
 	const float* __restrict__ distortion_data,
 	const Vector2i distortion_resolution,
 	ERenderMode render_mode,
-	ECameraMode camera_mode
+	ECameraMode camera_mode,
+	float dataset_scale
 ) {
 	uint32_t x = threadIdx.x + blockDim.x * blockIdx.x;
 	uint32_t y = threadIdx.y + blockDim.y * blockIdx.y;
@@ -1825,7 +1826,8 @@ __global__ void init_rays_with_payload_kernel_nerf(
 		camera_mode,
 		camera_distortion,
 		distortion_data,
-		distortion_resolution
+		distortion_resolution,
+		dataset_scale
 	);

 	NerfPayload& payload = payloads[idx];
@@ -1973,7 +1975,8 @@ void Testbed::NerfTracer::init_rays_from_camera(
 	float cone_angle_constant,
 	ERenderMode render_mode,
 	ECameraMode camera_mode,
-	cudaStream_t stream
+	cudaStream_t stream,
+	float dataset_scale
 ) {
 	// Make sure we have enough memory reserved to render at the requested resolution
 	size_t n_pixels = (size_t)resolution.x() * resolution.y();
@@ -2004,7 +2007,8 @@ void Testbed::NerfTracer::init_rays_from_camera(
 		distortion_data,
 		distortion_resolution,
 		render_mode,
-		camera_mode
+		camera_mode,
+		dataset_scale
 	);

 	m_n_rays_initialized = resolution.x() * resolution.y();
@@ -2268,7 +2272,8 @@ void Testbed::render_nerf(CudaRenderBuffer& render_buffer, const Vector2i& max_r
 		m_nerf.cone_angle_constant,
 		render_mode,
 		m_camera_mode,
-		stream
+		stream,
+		m_nerf.training.dataset.scale
 	);

 	uint32_t n_hit;
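
Note (not part of the patch): the new comment in pixel_to_ray argues that only the orthographic camera needs dataset_scale, because perspective and environment rays all start at the camera origin, while orthographic ray origins are spread across the image plane in scene units and therefore have to follow the dataset's rescaling. The standalone sketch below illustrates that reasoning under simplified assumptions; Vec3, orthographic_origin and perspective_origin are hypothetical helpers written for this note, not functions from the instant-ngp codebase, and 0.33 is only used as a typical NeRF dataset scale.

// Standalone illustration: why only the orthographic camera needs 'dataset_scale'.
#include <cstdio>

struct Vec3 { float x, y, z; };

// Perspective/environment cameras: every ray starts at the camera origin.
// Rescaling the dataset never moves this point, so no dataset_scale is needed.
Vec3 perspective_origin() {
	return {0.f, 0.f, 0.f};
}

// Orthographic camera: the ray origin lies on the image plane, offset from the
// optical axis in scene units. If the dataset was rescaled by dataset_scale,
// the per-pixel offset has to follow, which is what the added
// `head_pos *= dataset_scale;` line in the patch does.
Vec3 orthographic_origin(float u, float v,             // pixel offset from screen center, in [-0.5, 0.5]
                         float focal_x, float focal_y, // focal length in pixels
                         int res_x, int res_y,         // image resolution
                         float dataset_scale) {
	Vec3 head_pos = {
		u * (float)res_x / focal_x,
		v * (float)res_y / focal_y,
		0.f
	};
	head_pos.x *= dataset_scale;
	head_pos.y *= dataset_scale;
	head_pos.z *= dataset_scale;
	return head_pos;
}

int main() {
	// A pixel at the right edge of a 1920x1080 image with a 1000px focal length,
	// rendered against a dataset rescaled by 0.33 (a common NeRF scale).
	Vec3 o = orthographic_origin(0.5f, 0.f, 1000.f, 1000.f, 1920, 1080, 0.33f);
	Vec3 p = perspective_origin();
	std::printf("orthographic ray origin: (%.3f, %.3f, %.3f)\n", o.x, o.y, o.z);
	std::printf("perspective  ray origin: (%.3f, %.3f, %.3f)\n", p.x, p.y, p.z);
	return 0;
}

Without the scaling of head_pos, an orthographic render of a rescaled dataset would sample ray origins that are too far apart (or too close together) by the inverse of the scale, so the image would appear zoomed relative to the training views; pos_to_pixel applies the inverse division so motion vectors stay consistent with pixel_to_ray.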