lib/util/DataReader.v3

// Copyright 2019 Ben L. Titzer. All rights reserved.
// See LICENSE for details of Apache 2.0 license.

// Utility for reading data items from a stream of bytes, offering routines to
// read bytes, integers, LEBs, etc. A data reader has state for whether an
// error occurred, keeping the first (earliest position) error. Read operations
// only succeed if all of the data they expect is in the current buffer and
// instead signal an end-of-file error if they would go past the end of the
// current buffer. When the caller has more data to supply, they can use the
// {putk()} and {putr()} calls to add data to this buffer while minding proper
// aliasing.
class DataReader {
	def var data: Range<byte>;		// buffer holding the bytes being read
	def var pos: int;			// index in {data} of the next byte to read
	def var limit: int = data.length;	// index in {data} one past the last readable byte
	var startAddr: u64;			// address corresponding to {pos == 0}

	def var ok: bool = true;		// {true} until the first error is recorded
	def var error_pos: int = int.max;	// position of the earliest recorded error
	def var error_msg: string;		// message of the earliest recorded error

	var onEOF = DataReader.setEOFError;	// callback invoked by {eof_at()}
	var onError = DataReader.setFirstError;	// callback invoked by {fail_at()}

	new(data) { }

	// Return {true} if the input has been completely consumed.
	def done() -> bool {
		return pos >= limit;
	}
	// Return {true} if more input remains, i.e. {!done()}.
	def more() -> bool {
		return pos < limit;
	}
	// Return the number of available bytes, i.e. {limit - pos}, or {0} if the
	// position is at or beyond the limit.
	def available() -> int {
		var diff = limit - pos;
		return if(diff > 0, diff);
	}
	// Set the current position to be {npos}. No bounds check is performed.
	def at(npos: int) -> this {
		pos = npos;
	}
	// Set the current position to be the same as {this.limit}.
	def atLimit() -> this {
		pos = limit;
	}
	// Set the current position and limit to be {npos} and {nlimit}.
	// No bounds check is performed.
	def atl(npos: int, nlimit: int) -> this {
		pos = npos;
		limit = nlimit;
	}
	// Calculate the address of the current position.
	def addr() -> u64 {
		return u64.!(pos) + startAddr;
	}
	// Calculate the address of a position {pos}.
	def addr_of(pos: int) -> u64 {
		return u64.!(pos) + startAddr;
	}
	// Reset the current data, position, and limit, and set the start address
	// back to {0}. The error state ({ok}, {error_pos}, {error_msg}) is not touched.
	def reset(ndata: Range<byte>, npos: int, nlimit: int) -> this {
		startAddr = 0;
		data = ndata;
		pos = npos;
		limit = nlimit;
	}
	// Record an error at the current position with the specified message {msg}.
	def fail(msg: string) -> this {
		fail_at(pos, msg);
	}
	// Record an error at {pos} with the specified message {msg} by invoking
	// the {onError} callback.
	def fail_at(pos: int, msg: string) -> this {
		onError(this, pos, msg);
	}
	// Record end-of-file for a read of {size} bytes beginning at the position
	// {pos} by invoking the {onEOF} callback. The message {msg} is first passed
	// through {Strings.format1} with {size} as its argument.
	def eof_at(pos: int, size: int, msg: string) -> this {
		onEOF(this, pos, size, Strings.format1(msg, size));
	}
	// Peek a single byte, if available, or {-1} if EOF. Does not advance {pos}.
	def peek1() -> int {
		return if(pos >= limit, -1, data[pos]);
	}
	// Check if {length} bytes can be read. Never records an error.
	def peekN(length: int) -> bool {
		return (limit - pos) >= length;
	}
	// Consume the next byte if it matches {b} and return {true}; otherwise
	// leave the position unchanged and return {false}.
	def match1(b: byte) -> bool {
		if (pos < limit && data[pos] == b) {
			pos++;
			return true;
		}
		return false;
	}
	// Shared bounds check for the fixed-length operations: return {true} if at
	// least {length} bytes remain before {limit}; otherwise report an EOF at the
	// current position and return {false}.
	private def require_bytes(length: int) -> bool {
		if (limit - pos >= length) return true;
		eof_at(pos, length, "expecting %d bytes");
		return false;
	}
	// Check that a string of {length} bytes could be read and record an EOF if not possible.
	def checkN(length: int) -> bool {
		return require_bytes(length);
	}
	// Skip {length} bytes, or record an EOF (and do not move) if fewer remain.
	def skipN(length: int) {
		if (require_bytes(length)) pos += length;
	}
	// Skip an LEB: advance past bytes up to and including the first one without
	// the continuation bit (0x80). Stops at {limit} without recording an error.
	def skip_leb() {
		while (pos < limit) {
			var b = data[pos++];
			if ((b & 0x80) == 0) break;
		}
	}
	// Read a single byte. Records an EOF and returns {0} if none remain.
	def read1() -> byte {
		if (pos >= limit) {
			eof_at(pos, 1, "expected byte");
			return 0;
		}
		return data[pos++];
	}
	// Return an internal alias to the next {len} bytes of the buffer. This does
	// NOT advance {pos}; callers such as {read_u32()} advance it themselves.
	// Records an EOF and returns {null} if fewer than {len} bytes remain.
	def acquire(len: int) -> Range<byte> {
		if (limit - pos < len) {
			eof_at(pos, len, "expected %d bytes");
			return null;
		}
		return data[pos ..+ len];
	}
	// Read a 4-byte little-endian unsigned integer. Returns {0} on EOF.
	def read_u32() -> u32 {
		var range = acquire(4);
		if (range.length != 4) return 0;	// EOF already recorded by {acquire()}
		pos += 4;
		return DataReaders.read_range_u32(range);
	}
	// Read an 8-byte little-endian unsigned integer. Returns {0} on EOF.
	def read_u64() -> u64 {
		var range = acquire(8);
		if (range.length != 8) return 0;	// EOF already recorded by {acquire()}
		pos += 8;
		return DataReaders.read_range_u64(range);
	}
	// Read an unsigned 32-bit LEB-encoded integer.
	def read_uleb32() -> u32 {
		var b = read1();
		if (b < 0x80) return b;		// fast path: single-byte encoding
		return read_xleb32(b, false, "LEB out of unsigned 32-bit range");
	}
	// Read a signed 32-bit LEB-encoded integer.
	def read_sleb32() -> i32 {
		var b = read1();
		if (b < 0x80) return i7.view(b);	// fast path: sign-extend the 7 payload bits
		return int.view(read_xleb32(b, true, "LEB out of signed 32-bit range"));
	}
	// Validate the final byte {b} of a maximum-length LEB (the 5th byte of a
	// 32-bit or the 10th byte of a 64-bit value). {shift} is the number of low
	// bits of {b} that still fit in the result. Fails with "overlong LEB" if {b}
	// has its continuation bit set, and with {msg} if the bits that do not fit
	// are not all zero (unsigned) or not a pure sign extension (signed).
	private def check_leb_ext(b: byte, signed: bool, shift: u3, msg: string) {
		if ((b & 0x80) != 0) {
			fail("overlong LEB");
			return;
		}
		if (signed) {
			// Keep the top in-range bit so that it must agree with all bits above it.
			var upper = i7.view(b) >> u3.view(shift - 1);
			if (upper != 0 && upper != -1) fail(msg);
		} else {
			var upper = i7.view(b) >> shift;
			if (upper != 0) fail(msg);
		}
	}
	// Slow path for 32-bit LEBs: {first} is the already-consumed first byte (with
	// its continuation bit set). The position is only committed once a complete
	// encoding has been seen; on EOF it is left just after {first}.
	private def read_xleb32(first: byte, signed: bool, msg: string) -> u32 {
		var result = u32.view(first) & 0x7Fu, p = pos, shift = 7;
		while (p < limit) {
			var b = data[p++];
			result = result | u32.view(b & 0x7F) << u5.view(shift);
			shift += 7;
			if (shift == 35) { // consumed (special) 5th byte
				// check upper bits of last byte and extension
				check_leb_ext(b, signed, 4, msg);
				pos = p;
				return result;
			}
			if (b < 0x80) { // no continuation byte
				if (signed) { // perform sign-extension
					var rem = u5.view(0 - shift);
					result = u32.view(i32.view(result << rem) >> rem);
				}
				pos = p;
				return result;
			}
		}
		eof_at(pos, 0, "unterminated LEB");
		return 0;
	}
	// Read an unsigned 64-bit LEB-encoded integer.
	def read_uleb64() -> u64 {
		var b = read1();
		if (b < 0x80) return b;		// fast path: single-byte encoding
		return read_xleb64(b, false, "LEB out of unsigned 64-bit range");
	}
	// Read a signed 64-bit LEB-encoded integer.
	def read_sleb64() -> i64 {
		var b = read1();
		if (b < 0x80) return i7.view(b);	// fast path: sign-extend the 7 payload bits
		return i64.view(read_xleb64(b, true, "LEB out of signed 64-bit range"));
	}
	// Slow path for 64-bit LEBs: {first} is the already-consumed first byte (with
	// its continuation bit set). The position is only committed once a complete
	// encoding has been seen; on EOF it is left just after {first}.
	private def read_xleb64(first: byte, signed: bool, msg: string) -> u64 {
		var result: u64 = u32.view(first) & 0x7Fu, p = pos, shift = 7;
		while (p < limit) {
			var b = data[p++];
			result = result | u64.view(b & 0x7F) << u6.view(shift);
			shift += 7;
			if (shift == 70) { // consumed (special) 10th byte
				// check upper bits of last byte and extension
				check_leb_ext(b, signed, 1, msg);
				pos = p;
				return result;
			}
			if (b < 0x80) { // no continuation byte
				if (signed) { // perform sign-extension
					var rem = u6.view(0 - shift);
					result = u64.view(i64.view(result << rem) >> rem);
				}
				pos = p;
				return result;
			}
		}
		eof_at(pos, 0, "unterminated LEB");
		return 0;
	}
	// Read {length} bytes, copying them into a new array. Records an EOF and
	// returns an empty array if fewer than {length} bytes remain.
	def readN(length: int) -> Array<byte> {
		if (!require_bytes(length)) return [];
		var result = Ranges.dup(data[pos ..+ length]);
		pos += length;
		return result;
	}
	// Allocate a fresh buffer with room for the {avail} unread bytes followed by
	// {extra} new bytes (or {expecting} bytes in total, if that is larger), and
	// copy the unread bytes to its start. The caller fills in the new bytes and
	// installs the buffer.
	private def copy_unread(expecting: int, avail: int, extra: int) -> Array<byte> {
		var combined = avail + extra;
		var nbuf = Array<byte>.new(if(expecting > combined, expecting, combined));
		for (i < avail) nbuf[i] = data[pos + i];
		return nbuf;
	}
	// Append more data to the internal buffer. Reuse the provided
	// buffer {x}, i.e. *alias* it, if there is no remaining data in the
	// internal buffer. The caller may specify an {expecting} argument that
	// it is expecting a given number of bytes, and thus this method should
	// preallocate more space to avoid future reallocations.
	def putk(expecting: int, x: Array<byte>, xpos: int, xlength: int) -> this {
		if ((data.length - limit) >= xlength) {
			// Enough space to copy into this buffer.
			for (i < xlength) data[limit + i] = x[xpos + i];
			limit += xlength;
			return;
		}
		var avail = available(), diff = 0;
		if (avail == 0 && x.length >= expecting) {
			// Reuse {x} in-place by aliasing it.
			diff = pos - xpos;
			data = x;
			pos = xpos;
			limit = xpos + xlength;
		} else {
			// Not enough space. Copy the old data and the new data
			// together into a new buffer.
			var nbuf = copy_unread(expecting, avail, xlength);
			for (i < xlength) nbuf[avail + i] = x[xpos + i];

			diff = pos;
			data = nbuf;
			pos = 0;
			limit = avail + xlength;
		}
		// Rebase so that {addr()} of the next unread byte is the same as before.
		startAddr += u64.view(diff);
	}
	// Append more data to the internal buffer. Copies {x} after {limit} if the
	// current buffer has room; otherwise moves the unread bytes and {x} into a
	// new buffer of at least {expecting} bytes. Never aliases {x}.
	def putr(expecting: int, x: Range<byte>) -> this {
		if ((data.length - limit) >= x.length) {
			// Enough space to copy into this buffer.
			for (i < x.length) data[limit + i] = x[i];
			limit += x.length;
			return;
		}
		// Copy the old data and the new data together into a new buffer.
		var avail = available();
		var nbuf = copy_unread(expecting, avail, x.length);
		for (i < x.length) nbuf[avail + i] = x[i];

		// Rebase so that {addr()} of the next unread byte is the same as before.
		startAddr += u64.view(pos);
		data = nbuf;
		pos = 0;
		limit = avail + x.length;
	}
	// Populate the given reader {d} with the range of bytes from {this.pos} to
	// {this.pos + length} and advance this reader past them. {d} shares (aliases)
	// this reader's buffer and start address. If fewer than {length} bytes
	// remain, records an EOF, consumes whatever is left, and leaves {d} untouched.
	def sub_bytes(d: DataReader, length: int) {
		if (!require_bytes(length)) {
			if (pos < limit) pos = limit;  // consume remaining
			return;
		}
		d.data = data;
		d.pos = this.pos;
		d.limit = this.pos + length;
		d.startAddr = this.startAddr;
		this.pos += length;
	}
	// Default {onEOF} handler: treat EOF as an ordinary error by forwarding
	// {pos} and {msg} to the {onError} callback; {size} is ignored.
	def setEOFError(pos: int, size: int, msg: string) {
		onError(this, pos, msg);
	}
	// Default {onError} handler: record the error only if {pos} is strictly
	// earlier than any previously recorded error position.
	def setFirstError(pos: int, msg: string) {
		if (pos < error_pos) {
			error_pos = pos;
			error_msg = msg;
			ok = false;
		}
	}
}
// Shared zero-length byte array, intended to be used instead of {null}.
// NOTE(review): nothing in the visible part of this file references it — confirm remaining users.
def EMPTY_BYTES: Array<byte> = [];

// Stateless helpers that decode a value directly from the front of a byte range, for callers
// that do not want the overhead of constructing a {Decoder} object. Multi-byte values go through
// layout references rather than byte-by-byte reads, which both combines multiple bounds checks
// and permits native (full word) reads when possible.
// Naming: {read_range_<result>_<stored>} reads a value stored as <stored> and widens it to <result>.
component DataReaders {
	// ---- Widening to u32 ----
	def read_range_u32_u8(r: Range<byte>) -> u32 {
		var b = r[0];
		return b;
	}
	def read_range_u32_u16(r: Range<byte>) -> u32 {
		var word = Ref<Layout_u16>.of(r);
		return word.val;
	}
	def read_range_u32_i8(r: Range<byte>) -> u32 {
		var b = i8.view(r[0]);		// reinterpret the byte as signed before widening
		return u32.view(b);
	}
	def read_range_u32_i16(r: Range<byte>) -> u32 {
		var word = Ref<Layout_u16>.of(r);
		var s = i16.view(word.val);	// reinterpret as signed before widening
		return u32.view(s);
	}

	// ---- Widening to u64 ----
	def read_range_u64_u8(r: Range<byte>) -> u64 {
		var b = r[0];
		return b;
	}
	def read_range_u64_u16(r: Range<byte>) -> u64 {
		var word = Ref<Layout_u16>.of(r);
		return word.val;
	}
	def read_range_u64_u32(r: Range<byte>) -> u64 {
		var word = Ref<Layout_u32>.of(r);
		return word.val;
	}
	def read_range_u64_i8(r: Range<byte>) -> u64 {
		var b = i8.view(r[0]);		// reinterpret the byte as signed before widening
		return u64.view(b);
	}
	def read_range_u64_i16(r: Range<byte>) -> u64 {
		var word = Ref<Layout_u16>.of(r);
		var s = i16.view(word.val);	// reinterpret as signed before widening
		return u64.view(s);
	}
	def read_range_u64_i32(r: Range<byte>) -> u64 {
		var word = Ref<Layout_u32>.of(r);
		var s = i32.view(word.val);	// reinterpret as signed before widening
		return u64.view(s);
	}

	// ---- Widening to i32 ----
	def read_range_i32_i8(r: Range<byte>) -> i32 {
		var b = i8.view(r[0]);
		return i32.view(b);
	}
	def read_range_i32_i16(r: Range<byte>) -> i32 {
		var word = Ref<Layout_u16>.of(r);
		var s = i16.view(word.val);
		return i32.view(s);
	}

	// ---- Widening to i64 ----
	def read_range_i64_i8(r: Range<byte>) -> i64 {
		var b = i8.view(r[0]);
		return i64.view(b);
	}
	def read_range_i64_i16(r: Range<byte>) -> i64 {
		var word = Ref<Layout_u16>.of(r);
		var s = i16.view(word.val);
		return i64.view(s);
	}
	def read_range_i64_i32(r: Range<byte>) -> i64 {
		var word = Ref<Layout_u32>.of(r);
		var s = i32.view(word.val);
		return i64.view(s);
	}

	// ---- Same-width reads: signed integers ----
	def read_range_i8(r: Range<byte>) -> i8 {
		var b = r[0];
		return i8.view(b);
	}
	def read_range_i16(r: Range<byte>) -> i16 {
		var word = Ref<Layout_i16>.of(r);
		return word.val;
	}
	def read_range_i32(r: Range<byte>) -> i32 {
		var word = Ref<Layout_i32>.of(r);
		return word.val;
	}
	def read_range_i64(r: Range<byte>) -> i64 {
		var word = Ref<Layout_i64>.of(r);
		return word.val;
	}

	// ---- Same-width reads: unsigned integers ----
	def read_range_u8(r: Range<byte>) -> u8 {
		var b = r[0];
		return b;
	}
	def read_range_u16(r: Range<byte>) -> u16 {
		var word = Ref<Layout_u16>.of(r);
		return word.val;
	}
	def read_range_u32(r: Range<byte>) -> u32 {
		var word = Ref<Layout_u32>.of(r);
		return word.val;
	}
	def read_range_u64(r: Range<byte>) -> u64 {
		var word = Ref<Layout_u64>.of(r);
		return word.val;
	}

	// ---- Floating point and 128-bit reads ----
	def read_range_float(r: Range<byte>) -> float {
		var word = Ref<Layout_float>.of(r);
		return word.val;
	}
	def read_range_double(r: Range<byte>) -> double {
		var word = Ref<Layout_double>.of(r);
		return word.val;
	}
	// Returns the two 64-bit halves as a {(lo_val, hi_val)} pair.
	def read_range_u128(r: Range<byte>) -> (u64, u64) {
		var pair = Ref<Layout_u128>.of(r);
		var lo = pair.lo_val, hi = pair.hi_val;
		return (lo, hi);
	}
}