From 2cf8f62379029da943bf7e6735081740145c7b82 Mon Sep 17 00:00:00 2001
From: Giovanni Bajo <rasky@develer.com>
Date: Mon, 7 Aug 2023 16:39:08 +0200
Subject: [PATCH] VADPCM: complete stereo support

---
 src/audio/rsp_mixer.S          | 78 ++++++++++++++++++++--------------
 src/audio/wav64.c              | 57 +++++++++++++++++--------
 tools/audioconv64/conv_wav64.c | 30 ++++++-------
 3 files changed, 98 insertions(+), 67 deletions(-)

diff --git a/src/audio/rsp_mixer.S b/src/audio/rsp_mixer.S
index 4a6836b6f9..969290862f 100644
--- a/src/audio/rsp_mixer.S
+++ b/src/audio/rsp_mixer.S
@@ -1214,6 +1214,8 @@ VADPCM_STATE:    	.space 16*2   # state, left/right
 #define input_aligned       s6
 #define next_input_aligned  s7
 #define stereo_toggle       k1
+#define input_incr     		v0
+#define output_incr    		v1
 
 #define vscale  $v01
 #define vstatef $v02
@@ -1275,21 +1277,29 @@ VADPCM_Decompress:
 	addiu nframes, 1
 
 	li dmem_input, %lo(VADPCM_INPUT)
+	li input_incr, 9
 	li dmem_output, %lo(VADPCM_OUTPUT)
+	li output_incr, 32
 
+	bgez a2, VADPCM_FetchFirstFrame
+	move stereo_toggle, a2
+
+	# If the waveform is stereo, double increments
+	sll input_incr, 1
+	sll output_incr, 1
+
+VADPCM_FetchFirstFrame:
 	# Fetch the first frame. Wait for the transfer to finish
 	move s4, dmem_input
 	move s0, a0
 	jal DMAIn
-	li t0, DMA_SIZE(9, 1)
-	addiu a0, 9
+	addiu t0, input_incr, -1
+	add a0, input_incr
 
+	move next_input_aligned, s4
 	lqv vstate1, 0x00,dmem_state
 	lqv vstate3, 0x10,dmem_state
 
-	move next_input_aligned, s4
-	move stereo_toggle, a2	
-
 	##################################################################
 
 VADPCM_DecompressLoop:
@@ -1302,11 +1312,11 @@ VADPCM_DecompressLoop:
 	xori dmem_input, 32   # swap
 	mtc0 dmem_input, COP0_DMA_SPADDR
 	mtc0 a0, COP0_DMA_RAMADDR
-	li t0, DMA_SIZE(9, 1)
+	addiu t0, input_incr, -1
 	mtc0 t0, COP0_DMA_READ
 	andi t0, a0, 7
 	add next_input_aligned, dmem_input, t0
-	addiu a0, 9
+	add a0, input_incr
 
 VADPCM_DecompressMono:
 	# Read from input: control byte, plus residuals
@@ -1387,43 +1397,47 @@ VADPCM_DecompressMono:
 	# Stereo deinterleave
 	addiu dmem_output, -64
 
+	lqv vstate0, 0x00,dmem_output
+	lqv vstate1, 0x10,dmem_output
+	lqv vstate2, 0x20,dmem_output
+	lqv vstate3, 0x30,dmem_output
 VADPCM_InterleaveStereo:
-	lqv vstate1, 0x00,dmem_output
-	lqv vstate3, 0x20,dmem_output
-	ssv vstate1.e0, 0x00,dmem_output
-	ssv vstate3.e0, 0x02,dmem_output
-	ssv vstate1.e1, 0x04,dmem_output
-	ssv vstate3.e1, 0x06,dmem_output
-	ssv vstate1.e2, 0x08,dmem_output
-	ssv vstate3.e2, 0x0A,dmem_output
-	ssv vstate1.e3, 0x0C,dmem_output
-	ssv vstate3.e3, 0x0E,dmem_output
-
-	ssv vstate1.e4, 0x10,dmem_output
-	ssv vstate3.e4, 0x12,dmem_output
-	ssv vstate1.e5, 0x14,dmem_output
-	ssv vstate3.e5, 0x16,dmem_output
-	ssv vstate1.e6, 0x18,dmem_output
-	ssv vstate3.e6, 0x1A,dmem_output
-	ssv vstate1.e7, 0x1C,dmem_output
-	ssv vstate3.e7, 0x1E,dmem_output
-
-	addiu dmem_output, 0x10
+	ssv vstate0.e0, 0x00,dmem_output
+	ssv vstate2.e0, 0x02,dmem_output
+	ssv vstate0.e1, 0x04,dmem_output
+	ssv vstate2.e1, 0x06,dmem_output
+	ssv vstate0.e2, 0x08,dmem_output
+	ssv vstate2.e2, 0x0A,dmem_output
+	ssv vstate0.e3, 0x0C,dmem_output
+	ssv vstate2.e3, 0x0E,dmem_output
+
+	ssv vstate0.e4, 0x10,dmem_output
+	ssv vstate2.e4, 0x12,dmem_output
+	ssv vstate0.e5, 0x14,dmem_output
+	ssv vstate2.e5, 0x16,dmem_output
+	ssv vstate0.e6, 0x18,dmem_output
+	ssv vstate2.e6, 0x1A,dmem_output
+	ssv vstate0.e7, 0x1C,dmem_output
+	ssv vstate2.e7, 0x1E,dmem_output
+
+	vcopy vstate0, vstate1
+	vcopy vstate2, vstate3
+	addiu dmem_output, 0x20
 	bltz stereo_toggle, VADPCM_InterleaveStereo 
 	xor stereo_toggle, a2
-	addiu dmem_output, -0x20
+	addiu dmem_output, -0x40
 
 	# Write output into RDRAM (async)
 VADPCM_Output:
 	mfc0 t0, COP0_DMA_FULL
 	bnez t0, VADPCM_Output
-	li t0, DMA_SIZE(32, 1)
+	addiu t0, output_incr, -1
 
 	mtc0 dmem_output, COP0_DMA_SPADDR
 	mtc0 a1, COP0_DMA_RAMADDR
 	mtc0 t0, COP0_DMA_WRITE 
 
-	addiu a1, 32
+	add a1, output_incr
 	xori dmem_output, 128   # swap
 
 	addiu nframes, -1
@@ -1432,8 +1446,6 @@ VADPCM_Output:
 
 	##################################################################
 
-	emux_trace_stop
-
 	# Save back state
 	sqv vstate1, 0x00,dmem_state
 	sqv vstate3, 0x10,dmem_state
diff --git a/src/audio/wav64.c b/src/audio/wav64.c
index 8575ec0bd1..5564efaaa3 100644
--- a/src/audio/wav64.c
+++ b/src/audio/wav64.c
@@ -23,7 +23,7 @@
 #include <limits.h>
 #include <stdalign.h>
 
-/** @brief Set to 1 to use the refernece C decode for VADPCM */
+/** @brief Set to 1 to use the reference C decode for VADPCM */
 #define VADPCM_REFERENCE_DECODER     0
 
 /** ID of a standard WAV file */
@@ -138,10 +138,6 @@ static inline void rsp_vadpcm_decompress(void *input, int16_t *output, bool ster
 	wav64_vadpcm_vector_t *state, wav64_vadpcm_vector_t *codebook)
 {
 	assert(nframes > 0 && nframes <= 256);
-	if (stereo) {
-		assert(nframes % 2 == 0);
-		nframes /= 2;
-	}
 	rspq_write(__mixer_overlay_id, 0x1,
 		PhysicalAddr(input), 
 		PhysicalAddr(output) | (nframes-1) << 24,
@@ -196,29 +192,54 @@ static void waveform_vadpcm_read(void *ctx, samplebuffer_t *sbuf, int wpos, int
 		assert(nframes <= 256);
 		nframes = MIN(nframes, 256);
 
+		// Acquire destination buffer from the sample buffer
 		int16_t *dest = (int16_t*)samplebuffer_append(sbuf, nframes*16);
 
-		assert((nframes & 1) == 0);
-		dma_read(dest + wlen - 9*nframes/2, vhead->current_rom_addr, 9*nframes);
+		// Calculate source pointer at the end of the destination buffer.
+		// VADPCM decoding can be safely made in-place, so no auxiliary buffer
+		// is necessary.
+		int src_bytes = 9 * nframes * wav->wave.channels;
+		void *src = (void*)dest + ((nframes*16) << SAMPLES_BPS_SHIFT(sbuf)) - src_bytes;
+
+		// Fetch compressed data
+		dma_read(src, vhead->current_rom_addr, src_bytes);
+		vhead->current_rom_addr += src_bytes;
 
 		#if VADPCM_REFERENCE_DECODER
-		vadpcm_error err = vadpcm_decode(
-			vhead->npredictors, vhead->order, vhead->codebook, &vhead->state,
-			nframes, dest, dest + wlen - 9*nframes/2);
-		assertf(err == 0, "VADPCM decoding error: %d\n", err);
+		if (wav->wave.channels == 1) {
+			vadpcm_error err = vadpcm_decode(
+				vhead->npredictors, vhead->order, vhead->codebook, vhead->state,
+				nframes, dest, src);
+			assertf(err == 0, "VADPCM decoding error: %d\n", err);
+		} else {
+			assert(wav->wave.channels == 2);
+			int16_t uncomp[2][16];
+			int16_t *dst = dest;
+
+			for (int i=0; i<nframes; i++) {
+				for (int j=0; j<2; j++) {
+					vadpcm_error err = vadpcm_decode(
+						vhead->npredictors, vhead->order, vhead->codebook + 8*j, &vhead->state[j],
+						1, uncomp[j], src);
+					assertf(err == 0, "VADPCM decoding error: %d\n", err);
+					src += 9;
+				}
+				for (int j=0; j<16; j++) {
+					*dst++ = uncomp[0][j];
+					*dst++ = uncomp[1][j];
+				}
+			}
+		}
 		#else
 		// Switch to highpri as late as possible
 		if (!highpri) {
 			rspq_highpri_begin();
 			highpri = true;
 		}
-		rsp_vadpcm_decompress(dest + wlen - 9*nframes/2, dest, wav->wave.channels==2, nframes, 
-			&vhead->state, vhead->codebook);
+		rsp_vadpcm_decompress(src, dest, wav->wave.channels==2, nframes, vhead->state, vhead->codebook);
 		#endif
 
-		vhead->current_rom_addr += 9*nframes;
 		wlen -= 16*nframes;
-		dest += 16*nframes;
 	}
 
 	if (highpri)
@@ -260,9 +281,11 @@ void wav64_open(wav64_t *wav, const char *fn) {
 		wav64_header_vadpcm_t vhead = {0};
 		dfs_read(&vhead, 1, sizeof(vhead), fh);
 
-		void *ext = malloc_uncached(sizeof(vhead) + vhead.npredictors * vhead.order * sizeof(wav64_vadpcm_vector_t));
+		int codebook_size = vhead.npredictors * vhead.order * head.channels * sizeof(wav64_vadpcm_vector_t);
+
+		void *ext = malloc_uncached(sizeof(vhead) + codebook_size);
 		memcpy(ext, &vhead, sizeof(vhead));
-		dfs_read(ext + sizeof(vhead), 1, vhead.npredictors * vhead.order * sizeof(wav64_vadpcm_vector_t), fh);
+		dfs_read(ext + sizeof(vhead), 1, codebook_size, fh);
 		wav->ext = ext;
 		wav->wave.read = waveform_vadpcm_read;
 		wav->wave.ctx = wav;
diff --git a/tools/audioconv64/conv_wav64.c b/tools/audioconv64/conv_wav64.c
index db5e63615d..00e7580e12 100644
--- a/tools/audioconv64/conv_wav64.c
+++ b/tools/audioconv64/conv_wav64.c
@@ -24,11 +24,6 @@ int wav_convert(const char *infn, const char *outfn) {
 		fprintf(stderr, "Converting: %s => %s (%d bits, %d Hz, %d channels, %s)\n", infn, outfn,
 			wav.bitsPerSample, wav.sampleRate, wav.channels, flag_wav_compress ? "vadpcm" : "raw");
 
-	if (flag_wav_compress == 1 && wav.channels != 1) {
-		fprintf(stderr, "ERROR: VADPCM compression only support mono files\n");
-		return 1;
-	}
-
 	// Decode the samples as 16bit big-endian. This will decode everything including
 	// compressed formats so that we're able to read any kind of WAV file, though
 	// it will end up as an uncompressed file.
@@ -114,32 +109,33 @@ int wav_convert(const char *infn, const char *outfn) {
 	} break;
 
 	case 1: { // vadpcm
-		if ((cnt / wav.channels) % kVADPCMFrameSampleCount) {
-			int newcnt = (cnt / wav.channels + kVADPCMFrameSampleCount - 1) / kVADPCMFrameSampleCount * kVADPCMFrameSampleCount * wav.channels;
-			samples = realloc(samples, newcnt * sizeof(int16_t));
-			memset(samples + cnt, 0, (newcnt - cnt) * sizeof(int16_t));
+		if (cnt % kVADPCMFrameSampleCount) {
+			int newcnt = (cnt + kVADPCMFrameSampleCount - 1) / kVADPCMFrameSampleCount * kVADPCMFrameSampleCount;
+			samples = realloc(samples, newcnt * wav.channels * sizeof(int16_t));
+			memset(samples + cnt, 0, (newcnt - cnt) * wav.channels * sizeof(int16_t));
 			cnt = newcnt;
 		}
 
 		enum { kPREDICTORS = 4 };
 
+		assert(cnt % kVADPCMFrameSampleCount == 0);
 		int nframes = cnt / kVADPCMFrameSampleCount;
 		void *scratch = malloc(vadpcm_encode_scratch_size(nframes));
 		struct vadpcm_vector *codebook = alloca(kPREDICTORS * kVADPCMEncodeOrder * wav.channels * sizeof(struct vadpcm_vector));
 		struct vadpcm_params parms = { .predictor_count = kPREDICTORS };
-		void *dest = malloc(nframes * kVADPCMFrameByteSize);
+		void *dest = malloc(nframes * kVADPCMFrameByteSize * wav.channels);
 		
-		int16_t *schan = malloc(cnt / wav.channels * sizeof(int16_t));;
+		int16_t *schan = malloc(cnt * sizeof(int16_t));
 		uint8_t *destchan = dest;
-		for (int i=0;i<wav.channels;i++) {
-			for (int j=0;j<cnt;j+=wav.channels)
-				schan[j/wav.channels] = samples[j+i];
-			vadpcm_error err = vadpcm_encode(&parms, codebook + kPREDICTORS * kVADPCMEncodeOrder * i, nframes / wav.channels, destchan, schan, scratch);
+		for (int i=0; i<wav.channels; i++) {
+			for (int j=0; j<cnt; j++)
+				schan[j] = samples[i + j*wav.channels];
+			vadpcm_error err = vadpcm_encode(&parms, codebook + kPREDICTORS * kVADPCMEncodeOrder * i, nframes, destchan, schan, scratch);
 			if (err != 0) {
 				fprintf(stderr, "VADPCM encoding error: %s\n", vadpcm_error_name(err));
 				return 1;
 			}
-			destchan += nframes / wav.channels * kVADPCMFrameByteSize;
+			destchan += nframes * kVADPCMFrameByteSize;
 		}
 
 		struct vadpcm_vector state = {0};
@@ -157,7 +153,7 @@ int wav_convert(const char *infn, const char *outfn) {
 		w32_at(out, wstart_offset, ftell(out));
 		for (int i=0;i<nframes;i++) {
 			for (int j=0;j<wav.channels;j++)
-				fwrite(dest + (j * (nframes / wav.channels) + i) * kVADPCMFrameByteSize, 1, kVADPCMFrameByteSize, out);
+				fwrite(dest + (j * nframes + i) * kVADPCMFrameByteSize, 1, kVADPCMFrameByteSize, out);
 		}
 		free(dest);
 		free(scratch);