diff --git a/CHANGELOG.md b/CHANGELOG.md index 9fcaaa4e..a433d0d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,11 @@ +## 0.4.24 + - read-only support for `JLD.jl` files + - read-only support for many HDF5 files. Most test files of HDF5.jl are covered + - read Opaque bit fields + - read some other string encodings + - read big endian numbers + - read typical chunking formats + ## 0.4.23 - Support for `const` fields in mutable structs diff --git a/Project.toml b/Project.toml index 0f1f7e15..64efbf77 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "JLD2" uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819" -version = "0.4.23" +version = "0.4.24" [deps] FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" diff --git a/src/JLD2.jl b/src/JLD2.jl index 30ca263f..8a9b8911 100644 --- a/src/JLD2.jl +++ b/src/JLD2.jl @@ -39,6 +39,8 @@ InternalError() = InternalError("") # but define our own to avoid type piracy jlwrite(io, x) = Base.write(io, x) jlread(io, x) = Base.read(io, x) +jlread(io::IO, ::Type{T}, n::Integer) where {T} = T[jlread(io, T) for _=1:n] + jlsizeof(x) = Base.sizeof(x) jlunsafe_store!(p, x) = Base.unsafe_store!(p, x) jlunsafe_load(p) = Base.unsafe_load(p) @@ -49,13 +51,13 @@ include("Lookup3.jl") include("mmapio.jl") include("bufferedio.jl") include("misc.jl") +include("superblock.jl") """ RelOffset Represents an HDF5 relative offset. This differs from a file offset (used elsewhere) in -that it is relative to the superblock base address. In practice, this means that -`FILE_HEADER_LENGTH `has been subtracted. `fileoffset` and `h5offset` convert between +that it is relative to the superblock base address. `fileoffset` and `h5offset` convert between `RelOffsets` and file offsets. 
""" struct RelOffset @@ -64,6 +66,7 @@ end define_packed(RelOffset) Base.:(==)(x::RelOffset, y::RelOffset) = x === y Base.hash(x::RelOffset) = hash(x.offset) +Base.:(+)(x::RelOffset, y::Integer) = RelOffset(UInt64(x.offset + y)) const UNDEFINED_ADDRESS = RelOffset(0xffffffffffffffff) const NULL_REFERENCE = RelOffset(0) @@ -105,11 +108,27 @@ Supertype of all HDF5 datatypes. """ abstract type H5Datatype end +""" + SharedDatatype + +Reference to a shared datatype message (stored elsewhere in a file). +""" +struct SharedDatatype <: H5Datatype + header_offset::RelOffset +end + +""" + CommittedDatatype + +Reference to a shared datatype message (stored elsewhere in a file). +These are stored in the `_types` group and indexed. +""" struct CommittedDatatype <: H5Datatype header_offset::RelOffset index::Int end + """ ReadRepresentation{T,ODR} @@ -126,6 +145,21 @@ read as type `S`. """ struct CustomSerialization{T,S} end + +struct Filter + id::UInt16 + flags::UInt16 + name::String + client_data::Vector{UInt32} +end + +struct FilterPipeline + filters::Vector{Filter} +end + +FilterPipeline() = FilterPipeline(Filter[]) +iscompressed(fp::FilterPipeline) = !isempty(fp.filters) + """ Group(file) @@ -166,6 +200,8 @@ mutable struct Group{T} written_links) end + + """ JLDFile{T<:IO} @@ -193,6 +229,7 @@ mutable struct JLDFile{T<:IO} root_group_offset::RelOffset root_group::Group{JLDFile{T}} types_group::Group{JLDFile{T}} + base_address::UInt64 function JLDFile{T}(io::IO, path::AbstractString, writable::Bool, written::Bool, compress,#::Union{Bool,Symbol}, @@ -200,7 +237,7 @@ mutable struct JLDFile{T<:IO} f = new(io, path, writable, written, compress, mmaparrays, 1, OrderedDict{RelOffset,CommittedDatatype}(), H5Datatype[], JLDWriteSession(), Dict{String,Any}(), IdDict(), IdDict(), Dict{RelOffset,WeakRef}(), - Int64(FILE_HEADER_LENGTH + jlsizeof(Superblock)), Dict{RelOffset,GlobalHeap}(), + DATA_START, Dict{RelOffset,GlobalHeap}(), GlobalHeap(0, 0, 0, Int64[]), 
Dict{RelOffset,Group{JLDFile{T}}}(), UNDEFINED_ADDRESS) finalizer(jld_finalizer, f) f @@ -214,14 +251,14 @@ JLDFile(io::IO, path::AbstractString, writable::Bool, written::Bool, compress, m Converts an offset `x` relative to the superblock of file `f` to an absolute offset. """ -fileoffset(f::JLDFile, x::RelOffset) = Int64(x.offset + FILE_HEADER_LENGTH) +fileoffset(f::JLDFile, x::RelOffset) = Int64(x.offset + f.base_address) """ - h5offset(f::JLDFile, x::RelOffset) + h5offset(f::JLDFile, x::Integer) Converts an absolute file offset `x` to an offset relative to the superblock of file `f`. """ -h5offset(f::JLDFile, x::Int64) = RelOffset(x - FILE_HEADER_LENGTH) +h5offset(f::JLDFile, x::Integer) = RelOffset(UInt64(x - f.base_address)) # # File @@ -306,6 +343,7 @@ function jldopen(fname::AbstractString, wr::Bool, create::Bool, truncate::Bool, unlock(OPEN_FILES_LOCK) end if f.written + f.base_address = 512 if f isa JLDFile{MmapIO} f.root_group = Group{JLDFile{MmapIO}}(f) f.types_group = Group{JLDFile{MmapIO}}(f) @@ -321,13 +359,21 @@ function jldopen(fname::AbstractString, wr::Bool, create::Bool, truncate::Bool, end function load_file_metadata!(f) - verify_file_header(f) - - seek(f.io, FILE_HEADER_LENGTH) - superblock = jlread(f.io, Superblock) + superblock = find_superblock(f) f.end_of_data = superblock.end_of_file_address + f.base_address = superblock.base_address f.root_group_offset = superblock.root_group_object_header_address - f.root_group = load_group(f, superblock.root_group_object_header_address) + if superblock.version >= 2 + verify_file_header(f) + else + @warn "This file was not written with JLD2. Some things may not work." + if f.writable + close(f) + throw(UnsupportedVersionException("This file can not be edited by JLD2. 
Please open in read-only mode.")) + end + end + try + f.root_group = load_group(f, f.root_group_offset) if haskey(f.root_group.written_links, "_types") types_group_offset = f.root_group.written_links["_types"]::RelOffset @@ -340,6 +386,11 @@ function load_file_metadata!(f) else f.types_group = Group{typeof(f)}(f) end + catch e + show(e) + f.types_group = Group{typeof(f)}(f) + + end nothing end @@ -442,15 +493,7 @@ function Base.close(f::JLDFile) f.root_group_offset = res end - # Write JLD2 header - seek(io, 0) - jlwrite(io, FILE_HEADER) - - # Write superblock - seek(io, FILE_HEADER_LENGTH) - jlwrite(io, Superblock(0, FILE_HEADER_LENGTH, UNDEFINED_ADDRESS, - f.end_of_data, f.root_group_offset)) - + write_file_header(f) truncate_and_close(io, f.end_of_data) else close(io) @@ -520,7 +563,6 @@ printtoc(io::IO, f::JLDFile; numlines = typemax(Int64)) = -include("superblock.jl") include("object_headers.jl") include("groups.jl") include("dataspaces.jl") @@ -528,6 +570,7 @@ include("attributes.jl") include("datatypes.jl") include("datasets.jl") include("global_heaps.jl") +include("fractal_heaps.jl") include("data/type_defs.jl") include("data/specialcased_types.jl") @@ -548,4 +591,4 @@ if ccall(:jl_generating_output, Cint, ()) == 1 # if we're precompiling the pac include("precompile.jl") end -end # module +end \ No newline at end of file diff --git a/src/attributes.jl b/src/attributes.jl index b26fb5ae..8d9f1701 100644 --- a/src/attributes.jl +++ b/src/attributes.jl @@ -48,22 +48,167 @@ function write_attribute(io::IO, f::JLDFile, attr::WrittenAttribute, wsession::J write_data(io, f, attr.data, odr, datamode(odr), wsession) end +""" + read_attribute(io::IO, f::JLDFile) + +Read an attribute message at the current postion of the `io` object. +Supports attribute message version 1 and 2. 
+""" function read_attribute(io::IO, f::JLDFile) + pos = position(io) ah = jlread(io, AttributeHeader) - ah.version == 0x02 || throw(UnsupportedVersionException()) - committed = ah.flags == 1 - !committed && ah.flags != 0 && throw(UnsupportedFeatureException()) + if ah.version == 1 + committed = false + name = Symbol(jlread(io, UInt8, ah.name_size-1)) + jlread(io, UInt8) == 0 || throw(InvalidDataException()) + skip_to_aligned!(io, pos) + + datatype_end = position(io) + ah.datatype_size + datatype_class, datatype_offset = read_datatype_message(io, f, committed) + seek(io, datatype_end) + skip_to_aligned!(io, pos) + + + dataspace_end = position(io) + ah.dataspace_size + dataspace = read_dataspace_message(io) + seek(io, dataspace_end) + skip_to_aligned!(io, pos) + + ReadAttribute(name, dataspace, datatype_class, datatype_offset, position(io)) + elseif ah.version == 2 || ah.version == 3 + committed = ah.flags == 1 + !committed && ah.flags != 0 && throw(UnsupportedFeatureException()) + + if ah.version == 3 + name_charset_encoding = jlread(io, UInt8) + end - name = Symbol(jlread(io, UInt8, ah.name_size-1)) - jlread(io, UInt8) == 0 || throw(InvalidDataException()) + name = Symbol(jlread(io, UInt8, ah.name_size-1)) + jlread(io, UInt8) == 0 || throw(InvalidDataException()) - datatype_end = position(io) + ah.datatype_size - datatype_class, datatype_offset = read_datatype_message(io, f, committed) - seek(io, datatype_end) + datatype_end = position(io) + ah.datatype_size + datatype_class, datatype_offset = read_datatype_message(io, f, committed) + seek(io, datatype_end) - dataspace_end = position(io) + ah.dataspace_size - dataspace = read_dataspace_message(io) - seek(io, dataspace_end) + dataspace_end = position(io) + ah.dataspace_size + dataspace = read_dataspace_message(io) + seek(io, dataspace_end) - ReadAttribute(name, dataspace, datatype_class, datatype_offset, position(io)) + ReadAttribute(name, dataspace, datatype_class, datatype_offset, position(io)) + else + 
throw(UnsupportedVersionException("Unknown Attribute Header Version $(ah.version)")) + end end + +""" + load_attributes(f::JLDFile, name::AbstractString) + load_attributes(g::Group, name::AbstractString) + load_attributes(f::JLDFile, offset::RelOffset) + +Return a list of attributes attached to the dataset or group. +""" +function load_attributes(f::JLDFile, name::AbstractString) + if isempty(name) || name == "/" + load_attributes(f, f.root_group_offset) + else + load_attributes(f.root_group,name) + end +end + +function load_attributes(g::Group, name::AbstractString) + f = g.f + f.n_times_opened == 0 && throw(ArgumentError("file is closed")) + (g, name) = pathize(g, name, false) + roffset = lookup_offset(g, name) + roffset != UNDEFINED_ADDRESS || throw(ArgumentError("did not find a group or dataset named \"$name\"")) + load_attributes(f, roffset) +end + +function load_attributes(f::JLDFile, offset::RelOffset) + io = f.io + chunk_start::Int64 = fileoffset(f, offset) + seek(io, chunk_start) + # Version 1 object header have no signature and start with version + header_version = jlread(io, UInt8) + + + if header_version == 1 + seek(io, chunk_start) + cio = io + sz,_,groupflags = read_obj_start(cio) + chunk_end = position(cio) + sz + # Skip to nearest 8byte aligned position + skip_to_aligned!(cio, fileoffset(f, offset)) + else + header_version = 2 + seek(io, chunk_start) + cio = begin_checksum_read(io) + sz,_,groupflags = read_obj_start(cio) + chunk_end = position(cio) + sz #- 4 + end + + # Messages + chunks = [(; chunk_start, chunk_end)] + chunk_number = 0 + + attrs = Any[] + while !isempty(chunks) + chunk = popfirst!(chunks) + chunk_start = chunk.chunk_start + chunk_end = chunk.chunk_end + + if chunk_number > 0 # Don't do this the first time around + seek(io, chunk_start) + if header_version == 2 + chunk_end -= 4 + cio = begin_checksum_read(io) + jlread(cio, UInt32) == OBJECT_HEADER_CONTINUATION_SIGNATURE || throw(InvalidDataException()) + end + end + chunk_number += 
1 + + while position(cio) <= chunk_end-4 + if header_version == 1 + # Message start 8byte aligned relative to object start + skip_to_aligned!(cio, chunk_start) + # Version 1 header message is padded + msg = HeaderMessage(jlread(cio, UInt16), jlread(cio, UInt16), jlread(cio, UInt8)) + skip(cio, 3) + else # header_version == 2 + msg = jlread(cio, HeaderMessage) + (groupflags & 4) == 4 && skip(cio, 2) + end + endpos = position(cio) + msg.size + + if msg.msg_type == HM_ATTRIBUTE + push!(attrs, read_attribute(cio, f)) + elseif msg.msg_type == HM_OBJECT_HEADER_CONTINUATION + continuation_offset = fileoffset(f, jlread(cio, RelOffset)) + continuation_length = jlread(cio, Length) + push!(chunks, (; chunk_start = continuation_offset, + chunk_end = continuation_offset + continuation_length)) + elseif (msg.flags & 2^3) != 0 + throw(UnsupportedFeatureException()) + end + seek(cio, endpos) + end + seek(cio, chunk_end) + + if header_version == 2 + # Checksum + end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException("Invalid Checksum")) + end + end + + map(attrs) do attr + attr_data = + try + read_attr_data(f, attr) + catch e + rethrow(e) + nothing + end + + attr.name => attr_data + end +end \ No newline at end of file diff --git a/src/backwards_compatibility.jl b/src/backwards_compatibility.jl index 254b58f8..e6943666 100644 --- a/src/backwards_compatibility.jl +++ b/src/backwards_compatibility.jl @@ -15,10 +15,167 @@ constructrr(::JLDFile, ::Type{T}, dt::VariableLengthDatatype, ::Vector{ReadAttri # The following definition is needed to correctly load Strings written # with JLD2 with versions v0.1.12 - v0.3.1 function read_array(f::JLDFile, dataspace::ReadDataspace, - rr::FixedLengthString{String}, data_length::Int, - filter_id::UInt16, header_offset::RelOffset, + rr::FixedLengthString{String}, layout::DataLayout, + filters::FilterPipeline, header_offset::RelOffset, attributes::Union{Vector{ReadAttribute},Nothing}) rrv = ReadRepresentation{UInt8,odr(UInt8)}() - v = 
read_array(f, dataspace, rrv, data_length, filter_id, NULL_REFERENCE, attributes) + v = read_array(f, dataspace, rrv, layout, filters, NULL_REFERENCE, attributes) String(v) +end + + +function julia_type(s::AbstractString) + s = replace(s, r"ASCIIString|UTF8String|ByteString" => "String") + if occursin("Base.UTF16String", s) + error("file contains Base.UTF16String, must be converted and re-saved with JLD 0.9 or less") + end + _julia_type(s) +end + + +function constructrr(f::JLDFile, str::String, dt::CompoundDatatype, attrs::Vector{ReadAttribute}) + jl_type = julia_type(str) + # for some reason JLD attaches a '_' at the end of member field names + strnames = [string(name)[1:end-1] for name in dt.names] + dt.names .= Symbol.(strnames) + constructrr(f, jl_type, dt, attrs) +end + + +struct UnsupportedType; end +struct UnconvertedType; end + +const _where_macrocall = Symbol("@where") +function expand_where_macro(e::Expr) + e.head = :where + popfirst!(e.args) + popfirst!(e.args) # source location + return true +end + +is_valid_type_ex(s::Symbol) = true +is_valid_type_ex(s::QuoteNode) = true +is_valid_type_ex(s) = isbitstype(typeof(s)) +function is_valid_type_ex(e::Expr) + if e.head === :curly || e.head == :tuple || e.head == :. 
+        return all(is_valid_type_ex, e.args)
+    elseif e.head === :where
+        return is_valid_type_ex(e.args[1])
+    elseif e.head === :let && length(e.args) == 2
+        return is_valid_type_ex(e.args[2]) &&
+               is_valid_type_ex(e.args[1].args[2])
+    elseif e.head == :call
+        f = e.args[1]
+        if f isa Expr
+            if f.head === :core
+                f = f.args[1]
+                return f === :Union || f === :TypeVar || f === :UnionAll
+            end
+        elseif f isa Symbol
+            return f === :Union || f === :TypeVar || f === :symbol
+        end
+    end
+    return false
+end
+
+const typemap_Core = Dict(
+    :Uint8 => :UInt8,
+    :Uint16 => :UInt16,
+    :Uint32 => :UInt32,
+    :Uint64 => :UInt64,
+    :Void => Symbol(Nothing)
+)
+
+const _typedict = Dict{String,Type}()
+
+function fixtypes(typ)
+    whereall = []
+    typ = fixtypes(typ, whereall)
+    while !isempty(whereall)
+        var = pop!(whereall)
+        typ = Expr(:let, var, Expr(:call, Expr(:core, :UnionAll), var.args[1], typ))
+    end
+    return typ
+end
+fixtypes(typ, whereall) = typ
+function fixtypes(typ::Expr, whereall::Vector{Any})
+    if typ.head === :macrocall && typ.args[1] === _where_macrocall
+        expand_where_macro(typ) # @where => TypeVar format forwards compatibility
+    end
+    if typ.head === :.
+        if length(typ.args) == 2 && typ.args[1] === :Core
+            arg = typ.args[2].value
+            return Expr(:., :Core, QuoteNode(get(typemap_Core, arg, arg)))
+        else
+            return typ
+        end
+    elseif typ == :(Core.Type{TypeVar(:T,Union(Core.Any,Core.Undef))}) || typ == :(Core.Type{TypeVar(:T)})
+        # Work around https://github.com/JuliaLang/julia/issues/8226 and the removal of Top
+        return :(Core.Type)
+    end
+
+    for i = 1:length(typ.args)
+        typ.args[i] = fixtypes(typ.args[i], whereall)
+    end
+
+    if (typ.head === :call && !isempty(typ.args) &&
+        typ.args[1] === :TypeVar) # TypeVar => where format backwards compatibility
+        tv = gensym()
+        push!(whereall, Expr(:(=), tv, typ))
+        return tv
+    end
+
+    if (typ.head === :call && !isempty(typ.args) &&
+        typ.args[1] === :Union)
+        typ = Expr(:curly, typ.args...)
+ end + + if typ.head === :tuple + if !any(x->isa(x,QuoteNode) || isbits(x), typ.args) + # guess that we have a tuple type represented as a tuple + typ = Expr(:curly, :Tuple, typ.args...) + end + end + + if typ.head === :curly + # assume literal TypeVar should work like `T{<:S}` + while !isempty(whereall) + var = pop!(whereall) + typ = Expr(:let, var, Expr(:call, Expr(:core, :UnionAll), var.args[1], typ)) + end + end + return typ +end + +function _julia_type(s::AbstractString) + typ = get(_typedict, s, UnconvertedType) + if typ == UnconvertedType + sp = Meta.parse(s, raise=false) + if (isa(sp, Expr) && (sp.head == :error || sp.head == :continue || sp.head == :incomplete)) + println("error parsing type string ", s) + eval(sp) + end + typ = julia_type(fixtypes(sp)) + if typ != UnsupportedType + _typedict[s] = typ + end + end + if typ == UnconvertedType || typ == UnsupportedType + return UnknownType(s) + else + return typ + end +end + +function julia_type(e::Union{Symbol, Expr}) + if is_valid_type_ex(e) + try # `try` needed to catch undefined symbols + # `e` should be fully qualified, and thus reachable from Main + typ = Core.eval(Main, e) + typ == Type && return Type + isa(typ, Type) && return typ + catch + end + end + return UnsupportedType end \ No newline at end of file diff --git a/src/compression.jl b/src/compression.jl index 8b474dfb..58e458cf 100644 --- a/src/compression.jl +++ b/src/compression.jl @@ -1,6 +1,56 @@ +function jlread(io, ::Type{FilterPipeline}) + version = jlread(io, UInt8) + nfilters = jlread(io, UInt8) + if version == 1 + skip(io, 6) + filters = map(1:nfilters) do _ + id = jlread(io, UInt16) + name_length = jlread(io, UInt16) + flags = jlread(io, UInt16) + nclient_vals = jlread(io, UInt16) + if iszero(name_length) + name = "" + else + name = read_bytestring(io) + skip(io, 8-mod1(sizeof(name), 8)-1) + end + client_data = jlread(io, UInt32, nclient_vals) + isodd(nclient_vals) && skip(io, 4) + Filter(id, flags, name, client_data) + end + return 
FilterPipeline(filters) + elseif version == 2 + filters = map(1:nfilters) do _ + id = jlread(io, UInt16) + if id > 255 + name_length = jlread(io, UInt16) + flags = jlread(io, UInt16) + nclient_vals = jlread(io, UInt16) + if iszero(name_length) + name = "" + else + name = read_bytestring(io) + skip(io, 8-mod1(sizeof(name), 8)-1) + end + else + name = "" + flags = jlread(io, UInt16) + nclient_vals = jlread(io, UInt16) + end + client_data = jlread(io, UInt32, nclient_vals) + Filter(id, flags, name, client_data) + end + return FilterPipeline(filters) + else + throw(UnsupportedVersionException("Filter Pipeline Message version $version is not implemented")) + end + + +end const COMPRESSOR_TO_ID = Dict( :ZlibCompressor => UInt16(1), + :ShuffleFilter => UInt16(2), :Bzip2Compressor => UInt16(307), #:BloscCompressor => UInt16(32001), :LZ4FrameCompressor => UInt16(32004), @@ -9,6 +59,7 @@ const COMPRESSOR_TO_ID = Dict( # For loading need filter_ids as keys const ID_TO_DECOMPRESSOR = Dict( UInt16(1) => (:CodecZlib, :ZlibCompressor, :ZlibDecompressor, ""), + UInt16(2) => (:JLD2, :ShuffleFilter, :ShuffleFilter, ""), UInt16(307) => (:CodecBzip2, :Bzip2Compressor, :Bzip2Decompressor, "BZIP2"), #UInt16(32001) => (:Blosc, :BloscCompressor, :BloscDecompressor, "BLOSC"), UInt16(32004) => (:CodecLz4, :LZ4FrameCompressor, :LZ4FrameDecompressor, "LZ4"), @@ -111,6 +162,16 @@ function get_decompressor(filter_id::UInt16) invoke_again, m = checked_import(modname) return invoke_again, @eval $m.$decompressorname() end +function get_decompressor(filters::FilterPipeline) + decompressors = Any[] + invoke_again = false + for filter in filters.filters + modname, compressorname, decompressorname, = ID_TO_DECOMPRESSOR[filter.id] + invoke_again, m = checked_import(modname) + push!(decompressors, @eval $m.$decompressorname()) + end + return invoke_again, decompressors +end pipeline_message_size(filter_id) = 4 + 12 + (filter_id > 255)*(2 + length(ID_TO_DECOMPRESSOR[filter_id][4])) @@ -188,23 +249,53 @@ 
function write_compressed_data(cio, f, data, odr, wsession, filter_id, compresso jlwrite(f.io, deflated) end +function decompress!(inptr::Ptr, data_length, element_size, n, decompressor::TranscodingStreams.Codec) + TranscodingStreams.initialize(decompressor) + data = transcode(decompressor, unsafe_wrap(Array, Ptr{UInt8}(inptr), data_length))::Array{UInt8, 1} + TranscodingStreams.finalize(decompressor) + return data +end +struct ShuffleFilter end + +function decompress!(data::Vector{UInt8}, data_length, element_size, num_elements, decompressor::ShuffleFilter) + # Start with all least significant bytes, then work your way up + # I'll leave this for somenone else to make performant + @assert data_length == length(data) + @assert data_length % element_size == 0 + @assert data_length÷element_size == num_elements + data_new = similar(data) + for n = eachindex(data_new) + j = 1 + (n-1)*num_elements + i = mod1(j , data_length) + (j-1)÷data_length + data_new[n] = data[i] + end + return data_new +end +function decompress!(io::IOStream, data_length, element_size, n, decompressor) + read!(TranscodingStream(decompressor, io), Vector{UInt8}(undef, element_size*n)) +end function read_compressed_array!(v::Array{T}, f::JLDFile{MmapIO}, rr::ReadRepresentation{T,RR}, - data_length::Int, - filter_id + data_length::Integer, + filters ) where {T,RR} - invoke_again, decompressor = get_decompressor(filter_id) + invoke_again, decompressors = get_decompressor(filters) if invoke_again - return Base.invokelatest(read_compressed_array!, v, f, rr, data_length, filter_id)::typeof(v) + return Base.invokelatest(read_compressed_array!, v, f, rr, data_length, filters)::typeof(v) end io = f.io inptr = io.curptr - TranscodingStreams.initialize(decompressor) - data = transcode(decompressor, unsafe_wrap(Array, Ptr{UInt8}(inptr), data_length))::Array{UInt8, 1} - TranscodingStreams.finalize(decompressor) + element_size = odr_sizeof(RR) + n = length(v) + data = decompress!(inptr, data_length, element_size, 
n, decompressors[end]) + if length(decompressors) > 1 + for decompressor in decompressors[end-1:-1:1] + data = decompress!(data, length(data), element_size, n, decompressor) + end + end @simd for i = 1:length(v) dataptr = Ptr{Cvoid}(pointer(data, odr_sizeof(RR)*(i-1)+1)) if !jlconvert_canbeuninitialized(rr) || jlconvert_isinitialized(rr, dataptr) @@ -217,17 +308,24 @@ end function read_compressed_array!(v::Array{T}, f::JLDFile{IOStream}, rr::ReadRepresentation{T,RR}, - data_length::Int, - filter_id, + data_length::Integer, + filters, ) where {T,RR} - invoke_again, decompressor = get_decompressor(filter_id) + invoke_again, decompressors = get_decompressor(filters) if invoke_again - return Base.invokelatest(read_compressed_array!, v, f, rr, data_length, filter_id)::typeof(v) + return Base.invokelatest(read_compressed_array!, v, f, rr, data_length, filters)::typeof(v) end + io = f.io data_offset = position(io) n = length(v) - data = read!(TranscodingStream(decompressor, io), Vector{UInt8}(undef, odr_sizeof(RR)*n)) + element_size = odr_sizeof(RR) + data = decompress!(io, data_length, element_size, n, decompressors[end]) + if length(decompressors) > 1 + for decompressor in decompressors[end-1:-1:1] + data = decompress!(data, length(data), element_size, n, decompressor) + end + end @simd for i = 1:n dataptr = Ptr{Cvoid}(pointer(data, odr_sizeof(RR)*(i-1)+1)) if !jlconvert_canbeuninitialized(rr) || jlconvert_isinitialized(rr, dataptr) diff --git a/src/data/number_types.jl b/src/data/number_types.jl index 02585c91..702bdf6c 100644 --- a/src/data/number_types.jl +++ b/src/data/number_types.jl @@ -19,23 +19,47 @@ for T in Base.uniontypes(UnsignedTypes) FixedPointDatatype($(T.parameters[1].size), false) end +struct BENumber{T} + x::T +end + +jlconvert(::ReadRepresentation{T,BENumber{T}}, ::JLDFile, ptr::Ptr, ::RelOffset) where {T} = + bswap(jlunsafe_load(convert(Ptr{T}, ptr))) function jltype(f::JLDFile, dt::FixedPointDatatype) - signed = dt.bitfield1 == 0x08 ? 
true : dt.bitfield1 == 0x00 ? false : throw(UnsupportedFeatureException()) + signed = Bool(dt.bitfield1 >> 3 & 0b1) + endianness = dt.bitfield1 & 0b1 # 0 → little endian, 1 → big endian + #endianness == 0 || throw(UnsupportedFeatureException("load big endian numbers is not implemented.")) ((dt.bitfield2 == 0x00) & (dt.bitfield3 == 0x00) & (dt.bitoffset == 0) & (dt.bitprecision == dt.size*8)) || throw(UnsupportedFeatureException()) - if dt.size == 8 - return signed ? ReadRepresentation{Int64,Int64}() : ReadRepresentation{UInt64,UInt64}() - elseif dt.size == 1 - return signed ? ReadRepresentation{Int8,Int8}() : ReadRepresentation{UInt8,UInt8}() - elseif dt.size == 4 - return signed ? ReadRepresentation{Int32,Int32}() : ReadRepresentation{UInt32,UInt32}() - elseif dt.size == 2 - return signed ? ReadRepresentation{Int16,Int16}() : ReadRepresentation{UInt16,UInt16}() - elseif dt.size == 16 - return signed ? ReadRepresentation{Int128,Int128}() : ReadRepresentation{UInt128,UInt128}() + if endianness == 0 + if dt.size == 8 + return signed ? ReadRepresentation{Int64,Int64}() : ReadRepresentation{UInt64,UInt64}() + elseif dt.size == 1 + return signed ? ReadRepresentation{Int8,Int8}() : ReadRepresentation{UInt8,UInt8}() + elseif dt.size == 4 + return signed ? ReadRepresentation{Int32,Int32}() : ReadRepresentation{UInt32,UInt32}() + elseif dt.size == 2 + return signed ? ReadRepresentation{Int16,Int16}() : ReadRepresentation{UInt16,UInt16}() + elseif dt.size == 16 + return signed ? ReadRepresentation{Int128,Int128}() : ReadRepresentation{UInt128,UInt128}() + else + throw(UnsupportedFeatureException()) + end else - throw(UnsupportedFeatureException()) + if dt.size == 8 + return signed ? ReadRepresentation{Int64,BENumber{Int64}}() : ReadRepresentation{UInt64,BENumber{UInt64}}() + elseif dt.size == 1 + return signed ? ReadRepresentation{Int8,BENumber{Int8}}() : ReadRepresentation{UInt8,BENumber{UInt8}}() + elseif dt.size == 4 + return signed ? 
ReadRepresentation{Int32,BENumber{Int32}}() : ReadRepresentation{UInt32,BENumber{UInt32}}()
+        elseif dt.size == 2
+            return signed ? ReadRepresentation{Int16,BENumber{Int16}}() : ReadRepresentation{UInt16,BENumber{UInt16}}()
+        elseif dt.size == 16
+            return signed ? ReadRepresentation{Int128,BENumber{Int128}}() : ReadRepresentation{UInt128,BENumber{UInt128}}()
+        else
+            throw(UnsupportedFeatureException())
+        end
     end
 end
 
@@ -50,6 +74,13 @@ h5fieldtype(::JLDFile, ::Type{Float32}, ::Type{Float32}, ::Initialized) =
 h5fieldtype(::JLDFile, ::Type{Float64}, ::Type{Float64}, ::Initialized) =
     FloatingPointDatatype(DT_FLOATING_POINT, 0x20, 0x3f, 0x00, 8, 0, 64, 52, 11, 0, 52, 0x000003ff)
+h5fieldtype(::JLDFile, ::Type{BENumber{Float16}}, ::Type{Float16}, ::Initialized) =
+    FloatingPointDatatype(DT_FLOATING_POINT, 0x21, 0x0f, 0x00, 2, 0, 16, 10, 5, 0, 10, 0x0000000f)
+h5fieldtype(::JLDFile, ::Type{BENumber{Float32}}, ::Type{Float32}, ::Initialized) =
+    FloatingPointDatatype(DT_FLOATING_POINT, 0x21, 0x1f, 0x00, 4, 0, 32, 23, 8, 0, 23, 0x0000007f)
+h5fieldtype(::JLDFile, ::Type{BENumber{Float64}}, ::Type{Float64}, ::Initialized) =
+    FloatingPointDatatype(DT_FLOATING_POINT, 0x21, 0x3f, 0x00, 8, 0, 64, 52, 11, 0, 52, 0x000003ff)
+
 function jltype(f::JLDFile, dt::FloatingPointDatatype)
     if dt == h5fieldtype(f, Float64, Float64, Val{true})
         return ReadRepresentation{Float64,Float64}()
@@ -57,6 +88,12 @@ function jltype(f::JLDFile, dt::FloatingPointDatatype)
         return ReadRepresentation{Float32,Float32}()
     elseif dt == h5fieldtype(f, Float16, Float16, Val{true})
         return ReadRepresentation{Float16,Float16}()
+    elseif dt == h5fieldtype(f, BENumber{Float64}, Float64, Val{true})
+        return ReadRepresentation{Float64,BENumber{Float64}}()
+    elseif dt == h5fieldtype(f, BENumber{Float32}, Float32, Val{true})
+        return ReadRepresentation{Float32,BENumber{Float32}}()
+    elseif dt == h5fieldtype(f, BENumber{Float16}, Float16, Val{true})
+        return ReadRepresentation{Float16,BENumber{Float16}}()
+    else
throw(UnsupportedFeatureException()) end diff --git a/src/data/reconstructing_datatypes.jl b/src/data/reconstructing_datatypes.jl index 9165547e..83ad2930 100644 --- a/src/data/reconstructing_datatypes.jl +++ b/src/data/reconstructing_datatypes.jl @@ -21,12 +21,12 @@ end # H5Datatype. We handle committed datatypes here, and other datatypes below. function jltype(f::JLDFile, cdt::CommittedDatatype) haskey(f.h5jltype, cdt) && return f.h5jltype[cdt]::ReadRepresentation - dt, attrs = read_committed_datatype(f, cdt) + dt, attrs = read_shared_datatype(f, cdt) julia_type_attr = nothing written_type_attr = nothing for attr in attrs - if attr.name == :julia_type + if attr.name == :julia_type || attr.name == Symbol("julia type") julia_type_attr = attr elseif attr.name == :written_type written_type_attr = attr @@ -78,6 +78,18 @@ function jltype(f::JLDFile, cdt::CommittedDatatype) f.h5jltype[cdt] = rr end + +# jltype is the inverse of h5type, providing a ReadRepresentation for an +# H5Datatype. We handle shared datatypes here: ones that were not "committed" by JLD2. 
+function jltype(f::JLDFile, sdt::SharedDatatype) + haskey(f.h5jltype, sdt) && return f.h5jltype[sdt]::ReadRepresentation + dt, attrs = read_shared_datatype(f, sdt) + rr = jltype(f, dt) + f.h5jltype[sdt] = rr +end + + + # Constructs a ReadRepresentation for a given opaque (bitstype) type function constructrr(::JLDFile, T::DataType, dt::BasicDatatype, attrs::Vector{ReadAttribute}) dt.class == DT_OPAQUE || throw(UnsupportedFeatureException()) @@ -227,7 +239,7 @@ function constructrr(f::JLDFile, T::DataType, dt::CompoundDatatype, end end end - return (ReadRepresentation{T,OnDiskRepresentation{offsets, Tuple{types...}, Tuple{odrs...}}()}(), false) + return (ReadRepresentation{T,OnDiskRepresentation{offsets, Tuple{types...}, Tuple{odrs...}, offsets[end]+odr_sizeof(odrs[end])}()}(), false) end end @@ -477,7 +489,7 @@ function reconstruct_odr(f::JLDFile, dt::CompoundDatatype, end types[i], h5types[i] = typeof(dtrr).parameters end - return OnDiskRepresentation{(dt.offsets...,), Tuple{types...}, Tuple{h5types...}}() + return OnDiskRepresentation{(dt.offsets...,), Tuple{types...}, Tuple{h5types...},dt.size}() end # Reconstruct type that is a "lost cause": either we were not able to resolve @@ -555,7 +567,7 @@ jlconvert(::ReadRepresentation{Core.TypeofBottom,nothing}, f::JLDFile, ptr::Ptr, end end end - + push!(args, (:obj)) return blk end diff --git a/src/data/specialcased_types.jl b/src/data/specialcased_types.jl index 2d090a9f..5efcc4cc 100644 --- a/src/data/specialcased_types.jl +++ b/src/data/specialcased_types.jl @@ -1,3 +1,16 @@ +## Opaque Data +struct OpaqueData{N} + data::Vector{UInt8} + OpaqueData(data) = new{length(data)}(data) +end +odr_sizeof(x::Type{OpaqueData{N}}) where {N} = UInt32(N) +function jlconvert(rr::ReadRepresentation{OpaqueData{N}, Vector{UInt8}}, ::JLDFile, ptr::Ptr, ::RelOffset) where N + data = Vector{UInt8}(undef, N) + unsafe_copyto!(pointer(data), convert(Ptr{UInt8}, ptr), N) + OpaqueData(data) +end + + ## Strings const H5TYPE_VLEN_UTF8 = 
VariableLengthDatatype(DT_VARIABLE_LENGTH, 0x11, 0x01, 0x00, @@ -24,16 +37,48 @@ h5type(f::JLDFile, writeas::Type{String}, x) = odr(::Type{String}) = fieldodr(String, true) objodr(x::String) = FixedLengthString{String}(jlsizeof(x)) +struct NullTerminated end +struct SpacePadded end +struct AsciiString{TERM} + length::Int +end + + +struct FixedLengthAsciiString{TERM, N} end + function jltype(f::JLDFile, dt::BasicDatatype) - if dt.class == DT_STRING + if dt.class >> 4 == 1 + if dt.class << 4 == DT_REFERENCE << 4 + return ReadRepresentation{Any,RelOffset}() + elseif dt.class << 4 == DT_STRING << 4 + if dt.bitfield1 == 0x00 && dt.bitfield2 == 0x00 && dt.bitfield3 == 0x00 + #return AsciiString{NullTerminated}(dt.size) + return ReadRepresentation{String, FixedLengthAsciiString{NullTerminated, dt.size}}() + elseif dt.bitfield1 == 0x10 && dt.bitfield2 == 0x00 && dt.bitfield3 == 0x00 + return FixedLengthString{String}(dt.size) + elseif dt.bitfield1 == 0x02 && dt.bitfield2 == 0x00 && dt.bitfield3 == 0x00 + return ReadRepresentation{String, FixedLengthAsciiString{SpacePadded, dt.size}}() + else + throw(UnsupportedFeatureException("Encountered an unsupported string type. 
$dt")) + end + elseif dt.class << 4 == DT_OPAQUE << 4 + return ReadRepresentation{OpaqueData{Int(dt.size)},Vector{UInt8}}() + + else + throw(UnsupportedFeatureException("Encountered an unsupported type.")) + end + end + if dt.class << 4 == DT_STRING << 4 if (dt.bitfield1 == 0x01 || dt.bitfield1 == 0x11) && dt.bitfield2 == 0x00 && dt.bitfield3 == 0x00 return FixedLengthString{String}(dt.size) + elseif dt.bitfield1 == 0x10 && dt.bitfield2 == 0x00 && dt.bitfield3 == 0x00 + return FixedLengthString{String}(dt.size) else - throw(UnsupportedFeatureException()) + throw(UnsupportedFeatureException("Encountered an unsupported string type.")) end - elseif dt.class == DT_OPAQUE - error("attempted to read a bare (non-committed) opaque datatype") - elseif dt.class == DT_REFERENCE + elseif dt.class << 4 == DT_OPAQUE << 4 + return ReadRepresentation{OpaqueData{Int(dt.size)},Vector{UInt8}}() + elseif dt.class << 4 == DT_REFERENCE << 4 return ReadRepresentation{Any,RelOffset}() else throw(UnsupportedFeatureException()) @@ -41,13 +86,24 @@ function jltype(f::JLDFile, dt::BasicDatatype) end function jltype(f::JLDFile, dt::VariableLengthDatatype) - if dt == H5TYPE_VLEN_UTF8 + if dt == H5TYPE_VLEN_UTF8 + # this is the fully supported JLD2 string return ReadRepresentation{String,Vlen{String}}() - else - throw(UnsupportedFeatureException()) + elseif dt.bitfield1 & 0x1 == 0x1 + # it's some kind of string. 
Let's try + return ReadRepresentation{String,Vlen{String}}() + else#if dt.bitfield1 & 0x1 == 0x0 # it's a sequence + rr = jltype(f, dt.basetype) + T = typeof(rr).parameters[1] + odr = typeof(rr).parameters[2] + return ReadRepresentation{Vector{T}, Vlen{odr}}() end end +jlconvert(::ReadRepresentation{Vector{T},Vlen{ODR}}, f::JLDFile, ptr::Ptr, ::RelOffset) where {T, ODR} = + jlconvert(ReadRepresentation{T,Vlen{ODR}}(), f, ptr, UNDEFINED_ADDRESS) + + function h5convert!(out::Pointers, fls::FixedLengthString, f::JLDFile, x, ::JLDWriteSession) fls.length == jlsizeof(x) || throw(InvalidDataException()) (unsafe_copyto!(convert(Ptr{UInt8}, out), pointer(x), fls.length); nothing) @@ -63,6 +119,30 @@ function jlconvert(rr::FixedLengthString{String}, ::JLDFile, ptr::Ptr, ::RelOffs String(data) end +# Ascii String +function jlconvert(rr::AsciiString{NullTerminated}, ::JLDFile, ptr::Ptr, ::RelOffset) + data = Vector{UInt8}(undef, rr.length) + unsafe_copyto!(pointer(data), convert(Ptr{UInt8}, ptr), rr.length) + String(data[1:end-1]) +end +function jlconvert(rr::ReadRepresentation{String, FixedLengthAsciiString{NullTerminated,N}}, ::JLDFile, ptr::Ptr, ::RelOffset) where {N} + data = Vector{UInt8}(undef, N) + unsafe_copyto!(pointer(data), convert(Ptr{UInt8}, ptr), N) + String(data) +end + +function jlconvert(rr::ReadRepresentation{String, FixedLengthAsciiString{SpacePadded,N}}, ::JLDFile, ptr::Ptr, ::RelOffset) where {N} + data = Vector{UInt8}(undef, N) + unsafe_copyto!(pointer(data), convert(Ptr{UInt8}, ptr), N) + rstrip(String(data)) +end +odr_sizeof(x::AsciiString) = x.length +odr_sizeof(x::Type{FixedLengthAsciiString{TERM, N}}) where {TERM, N} = UInt32(N)#::Int + + + + + # Used only for custom serialization constructrr(::JLDFile, ::Type{String}, dt::VariableLengthDatatype{FixedPointDatatype}, ::Vector{ReadAttribute}) = dt == H5TYPE_VLEN_UTF8 ? 
@@ -206,4 +286,4 @@ function writeas(NT::Type{NTuple{N,T}}) where {N,T} end wconvert(::Type{Vector{T}}, x::NTuple{N,T}) where {N,T} = collect(x) -rconvert(::Type{NTuple{N,T}}, x::Vector{T}) where {N,T} = NTuple{N,T}(x) +rconvert(::Type{NTuple{N,T}}, x::Vector{T}) where {N,T} = NTuple{N,T}(x) \ No newline at end of file diff --git a/src/data/type_defs.jl b/src/data/type_defs.jl index eebce017..c88c2ed3 100644 --- a/src/data/type_defs.jl +++ b/src/data/type_defs.jl @@ -2,7 +2,7 @@ const Initialized = Union{Type{Val{true}}, Type{Val{false}}} const Pointers = Union{Ptr{Cvoid}, IndirectPointer} -struct OnDiskRepresentation{Offsets,JLTypes,H5Types} end +struct OnDiskRepresentation{Offsets,JLTypes,H5Types, Size} end odr_sizeof(::Nothing) = 0 @Base.pure odr_sizeof(x::DataType) = Int(x.size) diff --git a/src/data/writing_datatypes.jl b/src/data/writing_datatypes.jl index 6c28e6e5..bc0bf1e8 100644 --- a/src/data/writing_datatypes.jl +++ b/src/data/writing_datatypes.jl @@ -1,5 +1,5 @@ # Initial ODR for DataType -const DataTypeODR = OnDiskRepresentation{(0, odr_sizeof(Vlen{String})),Tuple{String,Vector{Any}},Tuple{Vlen{String},Vlen{RelOffset}}} +const DataTypeODR = OnDiskRepresentation{(0, odr_sizeof(Vlen{String})),Tuple{String,Vector{Any}},Tuple{Vlen{String},Vlen{RelOffset}}, odr_sizeof(Vlen{String})+odr_sizeof(Vlen{RelOffset})} const NULL_COMMITTED_DATATYPE = CommittedDatatype(RelOffset(0), 0) @@ -35,8 +35,9 @@ function hasdata(T::DataType, encounteredtypes=DataType[]) end # Gets the size of an on-disk representation -Base.@pure function odr_sizeof(::OnDiskRepresentation{Offsets,JLTypes,H5Types}) where {Offsets,JLTypes,H5Types} - Offsets[end]+odr_sizeof(H5Types.parameters[end]) +function odr_sizeof(::OnDiskRepresentation{Offsets,JLTypes,H5Types,Size}) where {Offsets,JLTypes,H5Types,Size} + #Offsets[end]+odr_sizeof(H5Types.parameters[end]) + Size end # Determines whether a type will have the same layout on disk as in memory @@ -225,8 +226,8 @@ h5convert!(out::Pointers, 
::Type{T}, ::JLDFile, x, ::JLDWriteSession) where {T} # We pack types that have padding using a staged h5convert! method @generated function h5convert!(out::Pointers, - ::OnDiskRepresentation{Offsets,Types,H5Types}, - file::JLDFile, x, wsession::JLDWriteSession) where {Offsets,Types,H5Types} + ::OnDiskRepresentation{Offsets,Types,H5Types,Size}, + file::JLDFile, x, wsession::JLDWriteSession) where {Offsets,Types,H5Types,Size} T = x types = Types.parameters members = H5Types.parameters @@ -445,7 +446,8 @@ const H5TYPE_UNION = CompoundDatatype( const UnionTypeODR = OnDiskRepresentation{ (0, odr_sizeof(Vlen{String}), odr_sizeof(Vlen{String})+odr_sizeof(Vlen{RelOffset})), Tuple{String, Vector{Any}, Vector{Any}}, - Tuple{Vlen{String}, Vlen{RelOffset}, Vlen{RelOffset}}} + Tuple{Vlen{String}, Vlen{RelOffset}, Vlen{RelOffset}}, + odr_sizeof(Vlen{String})+2*odr_sizeof(Vlen{RelOffset})} function h5fieldtype(f::JLDFile, ::Type{T}, readas::Type{S}, ::Initialized) where {T<:Union,S<:Union} @lookup_committed f Union @@ -508,7 +510,7 @@ end ## UnionAll -const UnionAllODR = OnDiskRepresentation{(0, 8),Tuple{TypeVar,Any},Tuple{JLD2.RelOffset,JLD2.RelOffset}} +const UnionAllODR = OnDiskRepresentation{(0, 8),Tuple{TypeVar,Any},Tuple{RelOffset,RelOffset}, 16} # This needs its own h5convert! method, since otherwise we will attempt to specialize the # generic h5convert! 
method for the specific UnionAll type rather than for UnionAll @@ -602,7 +604,7 @@ function odr(::Type{T}) where T offset += odr_sizeof(fodr) end - OnDiskRepresentation{(offsets...,), Tuple{T.types...}, Tuple{odrs...}}() + OnDiskRepresentation{(offsets...,), Tuple{T.types...}, Tuple{odrs...}, offset}() end abstract type DataMode end @@ -613,8 +615,9 @@ datamode(::Type{CustomSerialization{WrittenAs,ODR}}) where {WrittenAs,ODR} = dat datamode(::Union{Type{<:Vlen},Type{RelOffset}}) = HasReferences() datamode(::DataType) = ReferenceFree() datamode(::FixedLengthString) = ReferenceFree() +datamode(::AsciiString) = ReferenceFree() datamode(::Nothing) = ReferenceFree() -@generated function datamode(odr::OnDiskRepresentation{Offsets,JLTypes,H5Types} where {Offsets,JLTypes}) where H5Types +@generated function datamode(odr::OnDiskRepresentation{Offsets,JLTypes,H5Types,Size} where {Offsets,JLTypes,Size}) where H5Types for ty in H5Types.parameters datamode(ty) == HasReferences() && return HasReferences() end diff --git a/src/datasets.jl b/src/datasets.jl index c6c7def4..6f145fee 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -13,87 +13,121 @@ function load_dataset(f::JLDFile, offset::RelOffset) val = f.jloffset[offset].value val !== nothing && return val end + if isgroup(f, offset) + return let loaded_groups = f.loaded_groups + get!(()->load_group(f, offset), loaded_groups, offset) + end + end io = f.io - seek(io, fileoffset(f, offset)) - cio = begin_checksum_read(io) - sz = read_obj_start(cio) - pmax = position(cio) + sz + chunk_start::Int64 = fileoffset(f, offset) + seek(io, chunk_start) + # Version 1 object header have no signature and start with version + header_version = jlread(io, UInt8) + + + if header_version == 1 + seek(io, chunk_start) + cio = io + sz,_,groupflags = read_obj_start(cio) + chunk_end = position(cio) + sz + # Skip to nearest 8byte aligned position + skip_to_aligned!(cio, fileoffset(f, offset)) + else + header_version = 2 + seek(io, chunk_start) + 
cio = begin_checksum_read(io) + sz,_,groupflags = read_obj_start(cio) + chunk_end = position(cio) + sz #- 4 + end # Messages + chunks = [(; chunk_start, chunk_end)] + chunk_number = 0 + dataspace = ReadDataspace() attrs = EMPTY_READ_ATTRIBUTES datatype_class::UInt8 = 0 datatype_offset::Int64 = 0 - data_offset::Int64 = 0 - data_length::Int = -1 - chunked_storage::Bool = false - filter_id::UInt16 = 0 - while position(cio) <= pmax-4 - msg = jlread(cio, HeaderMessage) - endpos = position(cio) + msg.size - if msg.msg_type == HM_DATASPACE - dataspace = read_dataspace_message(cio) - elseif msg.msg_type == HM_DATATYPE - datatype_class, datatype_offset = read_datatype_message(cio, f, (msg.flags & 2) == 2) - elseif msg.msg_type == HM_FILL_VALUE - (jlread(cio, UInt8) == 3 && jlread(cio, UInt8) == 0x09) || throw(UnsupportedFeatureException()) - elseif msg.msg_type == HM_DATA_LAYOUT - jlread(cio, UInt8) == 4 || throw(UnsupportedVersionException()) - storage_type = jlread(cio, UInt8) - if storage_type == LC_COMPACT_STORAGE - data_length = jlread(cio, UInt16) - data_offset = position(cio) - elseif storage_type == LC_CONTIGUOUS_STORAGE - data_offset = fileoffset(f, jlread(cio, RelOffset)) - data_length = jlread(cio, Length) - elseif storage_type == LC_CHUNKED_STORAGE - # TODO: validate this - flags = jlread(cio, UInt8) - dimensionality = jlread(cio, UInt8) - dimensionality_size = jlread(cio, UInt8) - skip(cio, Int(dimensionality)*Int(dimensionality_size)) - - chunk_indexing_type = jlread(cio, UInt8) - chunk_indexing_type == 1 || throw(UnsupportedFeatureException("Unknown chunk indexing type")) - data_length = jlread(cio, Length) - jlread(cio, UInt32) - data_offset = fileoffset(f, jlread(cio, RelOffset)) - chunked_storage = true - else - throw(UnsupportedFeatureException("Unknown data layout")) - end - elseif msg.msg_type == HM_FILTER_PIPELINE - version = jlread(cio, UInt8) - version == 2 || throw(UnsupportedVersionException("Filter Pipeline Message version $version is not 
implemented")) - nfilters = jlread(cio, UInt8) - nfilters == 1 || throw(UnsupportedFeatureException()) - filter_id = jlread(cio, UInt16) - issupported_filter(filter_id) || throw(UnsupportedFeatureException("Unknown Compression Filter $filter_id")) - elseif msg.msg_type == HM_ATTRIBUTE - if attrs === EMPTY_READ_ATTRIBUTES - attrs = ReadAttribute[read_attribute(cio, f)] - else - push!(attrs, read_attribute(cio, f)) + layout::DataLayout = DataLayout(0,0,0,-1) + filter_pipeline::FilterPipeline = FilterPipeline(Filter[]) + while !isempty(chunks) + chunk = popfirst!(chunks) + chunk_start = chunk.chunk_start + chunk_end = chunk.chunk_end + + if chunk_number > 0 # Don't do this the first time around + seek(io, chunk_start) + if header_version == 2 + chunk_end -= 4 + cio = begin_checksum_read(io) + jlread(cio, UInt32) == OBJECT_HEADER_CONTINUATION_SIGNATURE || throw(InvalidDataException()) end - elseif (msg.flags & 2^3) != 0 - throw(UnsupportedFeatureException()) end - seek(cio, endpos) - end - seek(cio, pmax) + chunk_number += 1 + + while position(cio) <= chunk_end-4 + if header_version == 1 + # Message start 8byte aligned relative to object start + skip_to_aligned!(cio, chunk_start) + # Version 1 header message is padded + msg = HeaderMessage(jlread(cio, UInt16), jlread(cio, UInt16), jlread(cio, UInt8)) + skip(cio, 3) + else # header_version == 2 + msg = jlread(cio, HeaderMessage) + (groupflags & 4) == 4 && skip(cio, 2) + end + endpos = position(cio) + msg.size + if msg.msg_type == HM_DATASPACE + dataspace = read_dataspace_message(cio) + elseif msg.msg_type == HM_DATATYPE + datatype_class, datatype_offset = read_datatype_message(cio, f, (msg.flags & 2) == 2) + elseif msg.msg_type == HM_FILL_VALUE_OLD + # don't know what to do with these + # ignore for now + elseif msg.msg_type == HM_FILL_VALUE + # don't know what to do with these + # ignore for now + version = jlread(cio, UInt8) + flags = jlread(cio, UInt8) + + elseif msg.msg_type == HM_DATA_LAYOUT + layout = jlread(cio, 
DataLayout, f) + elseif msg.msg_type == HM_FILTER_PIPELINE + filter_pipeline = jlread(cio, FilterPipeline) + elseif msg.msg_type == HM_ATTRIBUTE + if attrs === EMPTY_READ_ATTRIBUTES + attrs = ReadAttribute[read_attribute(cio, f)] + else + push!(attrs, read_attribute(cio, f)) + end + elseif msg.msg_type == HM_OBJECT_HEADER_CONTINUATION + continuation_offset = fileoffset(f, jlread(cio, RelOffset)) + continuation_length = jlread(cio, Length) + push!(chunks, (; chunk_start = continuation_offset, + chunk_end = continuation_offset + continuation_length)) - filter_id != 0 && !chunked_storage && throw(InvalidDataException("Compressed data must be chunked")) + elseif (msg.flags & 2^3) != 0 + throw(UnsupportedFeatureException()) + end + seek(cio, endpos) + end + seek(cio, chunk_end) - # Checksum - end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException("Invalid Checksum")) + if header_version == 2 + # Checksum + end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException("Invalid Checksum")) + end + end + iscompressed(filter_pipeline) && !ischunked(layout) && throw(InvalidDataException("Compressed data must be chunked")) # TODO verify that data length matches - val = read_data(f, dataspace, datatype_class, datatype_offset, data_offset, data_length, - filter_id, offset, attrs) + val = read_data(f, dataspace, datatype_class, datatype_offset, layout, + filter_pipeline, offset, attrs) val end + """ read_attr_data(f::JLDFile, attr::ReadAttribute) @@ -101,7 +135,7 @@ jlread data from an attribute. """ read_attr_data(f::JLDFile, attr::ReadAttribute) = read_data(f, attr.dataspace, attr.datatype_class, attr.datatype_offset, - attr.data_offset) + DataLayout(0,0,-1,attr.data_offset)) """ read_attr_data(f::JLDFile, attr::ReadAttribute, expected_datatype::H5Datatype, @@ -114,12 +148,12 @@ better type stability while simultaneously validating the data. 
function read_attr_data(f::JLDFile, attr::ReadAttribute, expected_datatype::H5Datatype, rr::ReadRepresentation) io = f.io - if attr.datatype_class == class(expected_datatype) + if (attr.datatype_class << 4) == (class(expected_datatype) << 4) seek(io, attr.datatype_offset) dt = jlread(io, typeof(expected_datatype)) if dt == expected_datatype seek(f.io, attr.data_offset) - read_dataspace = (attr.dataspace, NULL_REFERENCE, -1, UInt16(0)) + read_dataspace = (attr.dataspace, NULL_REFERENCE, DataLayout(0,0,-1,attr.data_offset), FilterPipeline()) return read_data(f, rr, read_dataspace) end end @@ -128,7 +162,7 @@ end """ read_data(f::JLDFile, dataspace::ReadDataspace, datatype_class::UInt8, - datatype_offset::Int64, data_offset::Int64[, filter_id::UInt16, + datatype_offset::Int64, data_offset::Int64[, filters::FilterPipeline, header_offset::RelOffset, attributes::Vector{ReadAttribute}]) Read data from a file. If `datatype_class` is typemax(UInt8), the datatype is assumed to be @@ -137,22 +171,56 @@ Otherwise, datatype_offset points to the offset of the datatype attribute. 
""" function read_data(f::JLDFile, dataspace::ReadDataspace, datatype_class::UInt8, datatype_offset::Int64, - data_offset::Int64, data_length::Int=-1, filter_id::UInt16=UInt16(0), + layout::DataLayout, + filters::FilterPipeline=FilterPipeline(), header_offset::RelOffset=NULL_REFERENCE, attributes::Union{Vector{ReadAttribute},Nothing}=nothing) # See if there is a julia type attribute io = f.io - if datatype_class == typemax(UInt8) # Committed datatype - rr = jltype(f, f.datatype_locations[h5offset(f, datatype_offset)]) - seek(io, data_offset) - read_dataspace = (dataspace, header_offset, data_length, filter_id) + if datatype_class == typemax(UInt8) # shared datatype message + # this means that it is "committed" to `_types` if the file was written by JLD2 + offset = h5offset(f, datatype_offset) + rr = jltype(f, get(f.datatype_locations, offset, SharedDatatype(offset))) + + if layout.data_offset == -1 + # There was no layout message. + # That means, this dataset is just a datatype + # return the Datatype + return typeof(rr).parameters[1] + end + + seek(io, layout.data_offset) + read_dataspace = (dataspace, header_offset, layout, filters) read_data(f, rr, read_dataspace, attributes) - else + + elseif layout.data_offset == typemax(Int64) seek(io, datatype_offset) @read_datatype io datatype_class dt begin rr = jltype(f, dt) - seek(io, data_offset) - read_dataspace = (dataspace, header_offset, data_length, filter_id) + T,S = typeof(rr).parameters + if layout.data_length > -1 + # TODO: this could use the fill value message to populate the array + @warn "This array should be populated by a fill value. This is not (yet) implemented." + end + v = Array{T, 1}() + header_offset !== NULL_REFERENCE && (f.jloffset[header_offset] = WeakRef(v)) + return v + end + else + seek(io, datatype_offset) + @read_datatype io datatype_class dt begin + dtt = dt + rr = jltype(f, dtt) + + if layout.data_offset == -1 + # There was no layout message. 
+ # That means, this dataset is just a datatype + # return the Datatype + return typeof(rr).parameters[1] + end + + seek(io, layout.data_offset) + read_dataspace = (dataspace, header_offset, layout, filters) read_data(f, rr, read_dataspace, attributes) end end @@ -161,15 +229,19 @@ end # Most types can only be scalars or arrays function read_data(f::JLDFile, @nospecialize(rr), - read_dataspace::Tuple{ReadDataspace,RelOffset,Int,UInt16}, + read_dataspace::Tuple{ReadDataspace,RelOffset,DataLayout,FilterPipeline}, attributes::Union{Vector{ReadAttribute},Nothing}=nothing) - dataspace, header_offset, data_length, filter_id = read_dataspace + dataspace, header_offset, layout, filters = read_dataspace if dataspace.dataspace_type == DS_SCALAR - filter_id != 0 && throw(UnsupportedFeatureException()) + iscompressed(filters) && throw(UnsupportedFeatureException()) read_scalar(f, rr, header_offset) elseif dataspace.dataspace_type == DS_SIMPLE - read_array(f, dataspace, rr, data_length, filter_id, header_offset, attributes) + read_array(f, dataspace, rr, layout, filters, header_offset, attributes) + elseif dataspace.dataspace_type == DS_V1 && dataspace.dimensionality == 0 + read_scalar(f, rr, header_offset) + elseif dataspace.dataspace_type == DS_V1 + read_array(f, dataspace, rr, layout, filters, header_offset, attributes) else throw(UnsupportedFeatureException()) end @@ -178,11 +250,11 @@ end # Reference arrays can only be arrays or null dataspace (for Union{} case) function read_data(f::JLDFile, rr::ReadRepresentation{Any,RelOffset}, - read_dataspace::Tuple{ReadDataspace,RelOffset,Int,UInt16}, + read_dataspace::Tuple{ReadDataspace,RelOffset,DataLayout,FilterPipeline}, attributes::Vector{ReadAttribute}) - dataspace, header_offset, data_length, filter_id = read_dataspace - filter_id != 0 && throw(UnsupportedFeatureException()) + dataspace, header_offset, layout, filters = read_dataspace + iscompressed(filters) && throw(UnsupportedFeatureException()) if dataspace.dataspace_type 
== DS_SIMPLE # Since this is an array of references, there should be an attribute # informing us of the type @@ -198,25 +270,28 @@ function read_data(f::JLDFile, end seek(io, startpos) return read_array(f, dataspace, ReadRepresentation{T,RelOffset}(), - -1, UInt16(0), header_offset, attributes) + layout, FilterPipeline(), header_offset, attributes) end end elseif dataspace.dataspace_type == DS_NULL return read_empty(ReadRepresentation{Union{},nothing}(), f, attributes[find_dimensions_attr(attributes)], header_offset) + elseif dataspace.dataspace_type == DS_V1 + return read_array(f, dataspace, ReadRepresentation{Any,RelOffset}(), + layout, FilterPipeline(), header_offset, attributes) end - throw(UnsupportedFeatureException()) + throw(UnsupportedFeatureException("Dataspace type $(dataspace.dataspace_type) not implemented")) end # Types with no payload can only be null dataspace function read_data(f::JLDFile, rr::Union{ReadRepresentation{T,nothing} where T, ReadRepresentation{T,CustomSerialization{S,nothing}} where {S,T}}, - read_dataspace::Tuple{ReadDataspace,RelOffset,Int,UInt16}, + read_dataspace::Tuple{ReadDataspace,RelOffset,DataLayout,FilterPipeline}, attributes::Vector{ReadAttribute}) - dataspace, header_offset, data_length, filter_id = read_dataspace - filter_id != 0 && throw(UnsupportedFeatureException()) + dataspace, header_offset, layout, filters = read_dataspace + iscompressed(filters) && throw(UnsupportedFeatureException()) dataspace.dataspace_type == DS_NULL || throw(UnsupportedFeatureException()) dimensions_attr_index = find_dimensions_attr(attributes) @@ -312,24 +387,71 @@ function construct_array(io::IO, ::Type{T}, ::Val{N})::Array{T,N} where {T,N} end function read_array(f::JLDFile, dataspace::ReadDataspace, - rr::ReadRepresentation{T,RR}, data_length::Int, - filter_id::UInt16, header_offset::RelOffset, + rr::ReadRepresentation{T,RR}, layout::DataLayout, + filters::FilterPipeline, header_offset::RelOffset, 
attributes::Union{Vector{ReadAttribute},Nothing}) where {T,RR} io = f.io - data_offset = position(io) - ndims, offset = get_ndims_offset(f, dataspace, attributes) - - seek(io, offset) - v = construct_array(io, T, Val(Int(ndims))) - n = length(v) - seek(io, data_offset) - if filter_id !=0 - read_compressed_array!(v, f, rr, data_length, filter_id) + data_offset = layout.data_offset + if !ischunked(layout) || (layout.chunk_indexing_type == 1) + #data_offset = position(io) + ndims, offset = get_ndims_offset(f, dataspace, attributes) + + seek(io, offset) + v = construct_array(io, T, Val(Int(ndims))) + n = length(v) + seek(io, data_offset) + if iscompressed(filters) + read_compressed_array!(v, f, rr, layout.data_length, filters) + else + read_array!(v, f, rr) + end + header_offset !== NULL_REFERENCE && (f.jloffset[header_offset] = WeakRef(v)) + v else - read_array!(v, f, rr) + ndims, offset = get_ndims_offset(f, dataspace, attributes) + seek(io, offset) + v = construct_array(io, T, Val(Int(ndims))) + if layout.version == 3 + # version 1 B-tree + # This version appears to be padding incomplete chunks + chunks = read_v1btree_dataset_chunks(f, h5offset(f, layout.data_offset), layout.dimensionality) + vchunk = Array{T, Int(ndims)}(undef, reverse(layout.chunk_dimensions)...) + for chunk in chunks + idx = reverse(chunk.idx[1:end-1]) + seek(io, fileoffset(f, chunk.offset)) + indexview = (:).(idx .+1, min.(idx .+ reverse(layout.chunk_dimensions), size(v))) + indexview2 = (:).(1, length.(indexview)) + + if iscompressed(filters) + if chunk.filter_mask == 0 + read_compressed_array!(vchunk, f, rr, chunk.chunk_size, filters) + v[indexview...] = @view vchunk[indexview2...] + else + if length(filters.filters) == 1 + read_array!(vchunk, f, rr) + v[indexview...] = @view vchunk[indexview2...] 
+ else + mask = Bool[chunk.filter_mask & 2^(n-1) == 0 for n=eachindex(filters.filters)] + if any(mask) + rf = FilterPipeline(filters.filters[mask]) + read_compressed_array!(vchunk, f, rr, chunk.chunk_size, rf) + v[indexview...] = @view vchunk[indexview2...] + else + read_array!(vchunk, f, rr) + v[indexview...] = @view vchunk[indexview2...] + end + + end + end + else + read_array!(vchunk, f, rr) + v[indexview...] = @view vchunk[indexview2...] + end + end + return v + end + throw(UnsupportedVersionException("Encountered a chunked array ($layout) that is not implemented.")) end - header_offset !== NULL_REFERENCE && (f.jloffset[header_offset] = WeakRef(v)) - v end @@ -600,7 +722,7 @@ function delete_written_link!(f::JLDFile, roffset::RelOffset, name::AbstractStri chunk_start_offset::Int64 = fileoffset(f, roffset) seek(io, chunk_start_offset) - sz = read_obj_start(io) + sz, = read_obj_start(io) chunk_checksum_offset::Int64 = position(io) + sz continuation_offset::Int64 = -1 diff --git a/src/dataspaces.jl b/src/dataspaces.jl index 170219bb..e85e797e 100644 --- a/src/dataspaces.jl +++ b/src/dataspaces.jl @@ -6,6 +6,7 @@ const DS_SCALAR = 0x00 const DS_SIMPLE = 0x01 const DS_NULL = 0x02 +const DS_V1 = 0xff struct WriteDataspace{N,A<:Tuple} dataspace_type::UInt8 @@ -86,7 +87,14 @@ end # dataspace_dimensions are the corresponding dimensions function read_dataspace_message(io::IO) dspace_start = jlread(io, DataspaceStart) - dspace_start.version == 2 || throw(UnsupportedVersionException()) - dataspace_type = dspace_start.dataspace_type - ReadDataspace(dataspace_type, dspace_start.dimensionality, position(io)) -end + if dspace_start.version == 1 || dspace_start.version == 0 + skip(io, 4) # skip another 4 bytes + dataspace_type = DS_V1 + return ReadDataspace(dataspace_type, dspace_start.dimensionality, position(io)) + elseif dspace_start.version == 2 + dataspace_type = dspace_start.dataspace_type + return ReadDataspace(dataspace_type, dspace_start.dimensionality, position(io)) + 
else + throw(UnsupportedVersionException("Dataspace Messages version $(dspace_start.version)")) + end +end \ No newline at end of file diff --git a/src/datatypes.jl b/src/datatypes.jl index 4490fa8c..0147d197 100644 --- a/src/datatypes.jl +++ b/src/datatypes.jl @@ -2,6 +2,9 @@ # Datatypes # +# Datatype class is encoded in the lower four bits (0-10) +# upper four bits encode the version used. JLD2 always uses version 3 +# details in the HDF5 format spec const DT_FIXED_POINT = UInt8(0) | (UInt8(3) << 4) const DT_FLOATING_POINT = UInt8(1) | (UInt8(3) << 4) const DT_TIME = UInt8(2) | (UInt8(3) << 4) @@ -14,6 +17,19 @@ const DT_ENUMERATED = UInt8(8) | (UInt8(3) << 4) const DT_VARIABLE_LENGTH = UInt8(9) | (UInt8(3) << 4) const DT_ARRAY = UInt8(10) | (UInt8(3) << 4) +const DATATYPES = Dict{UInt8, String}( + 0 => "DT_FIXED_POINT", + 1 => "DT_FLOATING_POINT", + 2 => "DT_TIME", + 3 => "DT_STRING", + 4 => "DT_BITFIELD", + 5 => "DT_OPAQUE", + 6 => "DT_COMPOUND", + 7 => "DT_REFERENCE", + 8 => "DT_ENUMERATED", + 9 => "DT_VARIABLE_LENGTH", + 10=> "DT_ARRAY") + # This is the description for: # Strings # Opaque datatypes @@ -33,6 +49,15 @@ OpaqueDatatype(size::Integer) = ReferenceDatatype() = BasicDatatype(DT_REFERENCE, 0x00, 0x00, 0x00, jlsizeof(RelOffset)) +function Base.:(==)(dt1::BasicDatatype, dt2::BasicDatatype) + ret = true + ret &= (dt1.class << 4) == (dt2.class << 4) + ret &= dt1.bitfield1 == dt2.bitfield1 + ret &= dt1.bitfield2 == dt2.bitfield2 + ret &= dt1.bitfield3 == dt2.bitfield3 + ret &= dt1.size == dt2.size + ret +end # Reads a datatype message and returns a (offset::RelOffset, class::UInt8) # tuple. If the datatype is committed, the offset is the offset of the # committed datatype and the class is typemax(UInt8). 
Otherwise, the @@ -41,12 +66,16 @@ ReferenceDatatype() = function read_datatype_message(io::IO, f::JLDFile, committed) if committed # Shared datatype - jlread(io, UInt8) == 3 || throw(UnsupportedVersionException()) - jlread(io, UInt8) == 2 || throw(UnsupportedFeatureException()) + version = jlread(io, UInt8) + msgtype = jlread(io, UInt8) + # supported combinations are + (version == 3 && msgtype == 2) || (version == 2) || throw(UnsupportedVersionException("Unsupported shared message")) + (typemax(UInt8), Int64(fileoffset(f, jlread(io, RelOffset)))) else # Datatype stored here - (jlread(io, UInt8), Int64(position(io)-1)) + class = jlread(io, UInt8) + (class, Int64(position(io)-1)) end end @@ -68,20 +97,26 @@ end # dispatch for all but variable length types macro read_datatype(io, datatype_class, datatype, then) esc(quote - if $datatype_class == DT_FIXED_POINT + if $datatype_class << 4 == DT_FIXED_POINT << 4 $(replace_expr(then, datatype, :(jlread($io, FixedPointDatatype)))) - elseif $datatype_class == DT_FLOATING_POINT + elseif $datatype_class << 4 == DT_FLOATING_POINT << 4 $(replace_expr(then, datatype, :(jlread($io, FloatingPointDatatype)))) - elseif $datatype_class == DT_STRING || $datatype_class == DT_OPAQUE || $datatype_class == DT_REFERENCE + elseif $datatype_class << 4 == DT_STRING << 4 || $datatype_class << 4 == DT_OPAQUE << 4 || $datatype_class << 4 == DT_REFERENCE << 4 $(replace_expr(then, datatype, :(jlread($io, BasicDatatype)))) - elseif $datatype_class == DT_COMPOUND + elseif $datatype_class << 4 == DT_COMPOUND << 4 $(replace_expr(then, datatype, :(jlread($io, CompoundDatatype)))) - elseif $datatype_class == DT_VARIABLE_LENGTH + elseif $datatype_class << 4 == DT_VARIABLE_LENGTH << 4 $(replace_expr(then, datatype, :(jlread($io, VariableLengthDatatype)))) - elseif $datatype_class == DT_BITFIELD + elseif $datatype_class << 4 == DT_BITFIELD << 4 $(replace_expr(then, datatype, :(jlread($io, BitFieldDatatype)))) + elseif $datatype_class << 4 == DT_TIME << 4 + 
$(replace_expr(then, datatype, :(jlread($io, TimeDatatype)))) + elseif $datatype_class << 4 == DT_ARRAY << 4 + $(replace_expr(then, datatype, :(jlread($io, ArrayDatatype)))) + elseif $datatype_class << 4 == DT_ENUMERATED << 4 + $(replace_expr(then, datatype, :(jlread($io, EnumerationDatatype)))) else - throw(UnsupportedFeatureException()) + throw(UnsupportedFeatureException("invalid datatype class $datatype_class")) end end) end @@ -131,6 +166,23 @@ define_packed(FloatingPointDatatype) class(dt::Union{BasicDatatype,FixedPointDatatype,FloatingPointDatatype}) = dt.class +function Base.:(==)(fp1::FloatingPointDatatype, fp2::FloatingPointDatatype) + ret = true + ret &= (fp1.class << 4) == (fp2.class << 4) # compare only class and not version + ret &= fp1.bitfield1 == fp2.bitfield1 + ret &= fp1.bitfield2 == fp2.bitfield2 + ret &= fp1.bitfield3 == fp2.bitfield3 + ret &= fp1.size == fp2.size + ret &= fp1.bitoffset == fp2.bitoffset + ret &= fp1.bitprecision == fp2.bitprecision + ret &= fp1.exponentlocation == fp2.exponentlocation + ret &= fp1.exponentsize == fp2.exponentsize + ret &= fp1.mantissalocation == fp2.mantissalocation + ret &= fp1.mantissasize == fp2.mantissasize + ret &= fp1.exponentbias == fp2.exponentbias + ret +end + struct CompoundDatatype <: H5Datatype size::UInt32 names::Vector{Symbol} @@ -144,6 +196,12 @@ struct CompoundDatatype <: H5Datatype end end +function jltype(f::JLDFile, dt::CompoundDatatype) + odr = reconstruct_odr(f, dt, RelOffset[]) + T = NamedTuple{tuple(dt.names...), typeof(odr).parameters[2]} + return ReadRepresentation{T, odr}() +end + Base.:(==)(x::CompoundDatatype, y::CompoundDatatype) = x.size == y.size && x.names == y.names && x.offsets == y.offsets && x.members == y.members @@ -184,6 +242,7 @@ end function jlread(io::IO, ::Type{CompoundDatatype}) dt = jlread(io, BasicDatatype) + version = dt.class >> 4 nfields = UInt16(dt.bitfield1) | UInt16(dt.bitfield2 << 8) dt.bitfield3 == 0 || throw(UnsupportedFeatureException()) @@ -193,9 +252,11 
@@ function jlread(io::IO, ::Type{CompoundDatatype}) for i = 1:nfields # Name names[i] = Symbol(read_bytestring(io)) - # Byte offset of member - if dt.size <= typemax(UInt8) + if version == 2 || version == 1 + skip(io, 8-mod1(sizeof(names[i]),8)-1) + offsets[i] = jlread(io, UInt32) + elseif dt.size <= typemax(UInt8) offsets[i] = jlread(io, UInt8) elseif dt.size <= typemax(UInt16) offsets[i] = jlread(io, UInt16) @@ -203,6 +264,16 @@ function jlread(io::IO, ::Type{CompoundDatatype}) offsets[i] = jlread(io, UInt32) end + if version == 1 + # supports array members + # can encode dimensionality here + dimensionality = jlread(io, UInt8) + skip(io, 3) + skip(io, 4) # dimension permutation + skip(io, 4) + skip(io, 16) + end + # Member type message datatype_class = jlread(io, UInt8) skip(io, -1) @@ -286,33 +357,87 @@ function commit(f::JLDFile, end # Read the actual datatype for a committed datatype -function read_committed_datatype(f::JLDFile, cdt::CommittedDatatype) +function read_shared_datatype(f::JLDFile, cdt::Union{SharedDatatype, CommittedDatatype}) io = f.io - seek(io, fileoffset(f, cdt.header_offset)) - cio = begin_checksum_read(io) - sz = read_obj_start(cio) - pmax = position(cio) + sz + chunk_start::Int64 = fileoffset(f, cdt.header_offset) + seek(io, chunk_start) + + header_version = jlread(io, UInt8) + if header_version == 1 + seek(io, chunk_start) + cio = io + sz, = read_obj_start(cio) + chunk_end = position(cio) + sz + # Skip to nearest 8byte aligned position + skip_to_aligned!(cio, chunk_start) + else + header_version = 2 + seek(io, chunk_start) + cio = begin_checksum_read(io) + sz, = read_obj_start(cio) + chunk_end = position(cio) + sz + end + # Messages + chunk_end::Int64 + continuation_message_goes_here::Int64 = -1 + chunks = [(; chunk_start, chunk_end)] + chunk_number = 0 # Messages datatype_class::UInt8 = 0 datatype_offset::Int = 0 attrs = ReadAttribute[] - while position(cio) < pmax - msg = jlread(cio, HeaderMessage) - endpos = position(cio) + msg.size 
- if msg.msg_type == HM_DATATYPE - # Datatype stored here - datatype_offset = position(cio) - datatype_class = jlread(cio, UInt8) - elseif msg.msg_type == HM_ATTRIBUTE - push!(attrs, read_attribute(cio, f)) + + + while !isempty(chunks) + chunk = popfirst!(chunks) + chunk_start = chunk.chunk_start + chunk_end = chunk.chunk_end + + if chunk_number > 0 + seek(io, chunk_start) + chunk_end -= 4 + if header_version == 2 + cio = begin_checksum_read(io) + jlread(cio, UInt32) == OBJECT_HEADER_CONTINUATION_SIGNATURE || throw(InvalidDataException()) + end + end + chunk_number += 1 + while (curpos = position(cio)) < chunk_end-4 + if header_version == 1 + # Message start 8byte aligned relative to object start + skip_to_aligned!(cio, chunk_start) + # Version 1 header message is padded + msg = HeaderMessage(jlread(cio, UInt16), jlread(cio, UInt16), jlread(cio, UInt8)) + skip(cio, 3) + else # header_version == 2 + msg = jlread(cio, HeaderMessage) + end + endpos = position(cio) + msg.size + if msg.msg_type == HM_DATATYPE + # Datatype stored here + datatype_offset = position(cio) + datatype_class = jlread(cio, UInt8) + elseif msg.msg_type == HM_ATTRIBUTE + push!(attrs, read_attribute(cio, f)) + elseif msg.msg_type == HM_OBJECT_HEADER_CONTINUATION + cont_chunk_start = fileoffset(f, jlread(cio, RelOffset)) + chunk_length = jlread(cio, Length) + push!(chunks, (;chunk_start=cont_chunk_start, + chunk_end =cont_chunk_start+chunk_length)) + elseif (msg.flags & 2^3) != 0 + throw(UnsupportedFeatureException()) + end + seek(cio, endpos) end - seek(cio, endpos) - end - seek(cio, pmax) - # Checksum - end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException()) + # Checksum + #seek(cio, chunk_end) + if header_version == 2 + end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException()) + end + seek(cio, chunk_end) + end seek(io, datatype_offset) @read_datatype io datatype_class dt begin @@ -320,7 +445,91 @@ function read_committed_datatype(f::JLDFile, 
cdt::CommittedDatatype) end end - struct FixedLengthString{T<:AbstractString} length::Int end + + +struct ArrayDatatype <: H5Datatype + class::UInt8 + bitfield1::UInt8 + bitfield2::UInt8 + bitfield3::UInt8 + dimensionality::UInt8 + dims::Vector{UInt32} + base_type::H5Datatype +end + +function jlread(io::IO, ::Type{ArrayDatatype}) + dt = jlread(io, BasicDatatype) + version = dt.class >> 4 + dimensionality = jlread(io, UInt8) + version == 2 && skip(io, 3) + dims = jlread(io, UInt32, dimensionality) + if version == 2 + # unsupported permutation index + skip(io, 4*dimensionality) + end + + datatype_class = jlread(io, UInt8) + skip(io, -1) + @read_datatype io datatype_class base_type begin + ArrayDatatype(dt.class, 0x0, 0x0, 0x0, dimensionality, dims, base_type) + end +end + +struct ArrayPlaceHolder{T, D} end + +odr_sizeof(::Type{ArrayPlaceHolder{T,D}}) where {T,D} = odr_sizeof(T)*prod(D) + +function jltype(f::JLDFile, dt::ArrayDatatype) + rr = jltype(f, dt.base_type) + T = typeof(rr).parameters[1] + ReadRepresentation{Array{T, Int(dt.dimensionality)}, ArrayPlaceHolder{rr, tuple(dt.dims...)}}() +end + + +function jlconvert(::ReadRepresentation{Array{T,D}, ArrayPlaceHolder{RR, DIMS}}, f::JLDFile, ptr::Ptr, + header_offset::RelOffset) where {T, D, RR, DIMS} + v = Array{T, D}(undef, reverse(DIMS)...) 
+ for i=1:prod(DIMS) + v[i] = jlconvert(RR, f, ptr, header_offset) + ptr += jlsizeof(typeof(RR).parameters[2]) + end + return v +end + +struct EnumerationDatatype <: H5Datatype + class::UInt8 + bitfield1::UInt8 + bitfield2::UInt8 + bitfield3::UInt8 + base_type::DataType + names::Vector{String} + values::Vector{<:Any} +end + +function jlread(io::IO, ::Type{EnumerationDatatype}) + dt = jlread(io, BasicDatatype) + version = dt.class >> 4 + num_members = dt.bitfield1 | UInt16(dt.bitfield2)<<8 + base_type = jlread(io, BasicDatatype) + T = uintofsize(base_type.size) + # assume that it is an Integer + names = String[] + values = Any[] + for _=1:num_members + name = read_bytestring(io) + push!(names, name) + version < 3 && skip(io, 8-mod1(sizeof(name), 8)) + push!(values, jlread(io, T)) + end + return EnumerationDatatype(dt.class, dt.bitfield1, dt.bitfield2, dt.bitfield3, + T, names, values) +end + +function jltype(f::JLDFile, dt::EnumerationDatatype) + ReadRepresentation{dt.base_type, dt.base_type}() +end + +# Can't read big endian ints diff --git a/src/file_header.jl b/src/file_header.jl index f5b67217..fba0e8de 100644 --- a/src/file_header.jl +++ b/src/file_header.jl @@ -19,6 +19,10 @@ const LEGACY_REQUIRED_FILE_HEADER = "Julia data file (HDF5), version 0.2.0" function verify_file_header(f) io = f.io fname = f.path + if f.base_address != FILE_HEADER_LENGTH + @warn "File likely not written by JLD2. Skipping header verification." 
+ return + end seek(io, 0) headermsg = String(read!(io, Vector{UInt8}(undef, length(REQUIRED_FILE_HEADER)))) if headermsg != REQUIRED_FILE_HEADER @@ -40,3 +44,15 @@ function verify_file_header(f) Attempting to load data.""", maxlog=1) end end + +function write_file_header(f) + io = f.io + if f.base_address >= FILE_HEADER_LENGTH + seek(io, f.base_address - FILE_HEADER_LENGTH) + jlwrite(io, FILE_HEADER) + end + # Write superblock + seek(io, f.base_address) + write_superblock(io,f) + return nothing +end \ No newline at end of file diff --git a/src/fractal_heaps.jl b/src/fractal_heaps.jl new file mode 100644 index 00000000..5c355c4e --- /dev/null +++ b/src/fractal_heaps.jl @@ -0,0 +1,513 @@ +const FRACTAL_HEAP_HEADER_SIGNATURE = htol(0x50485246) # UInt8['F','R','H','P'] +const FRACTAL_HEAP_INDIRECT_BLOCK_SIGNATURE = htol(0x42494846) # UInt8['F','H','I','B'] +const FRACTAL_HEAP_DIRECT_BLOCK_SIGNATURE = htol(0x42444846) # UInt8['F', 'H', 'D', 'B'] + +struct FractalHeapHeader + offset::RelOffset + table_width::Int + starting_block_size::Int + max_direct_block_size::Int + max_heap_size::Int + root_block_address::RelOffset + cur_num_rows_in_root_iblock::Int + has_io_filter::Bool + max_dblock_rows::Int + max_size_managed_objects::Int + # could add the rest of the fields if they ever become necessary +end + +struct FractalHeapDirectBlock + offset::RelOffset # position of block in file + # block offset in heaps address space + # WARNING: don't use. 
sometimes wrong in long files + block_offset::UInt64 + size::UInt64 + filtered_size::UInt64 # set to typemax if not filtered + filter_mask::UInt32 # set to typemax if not filtered +end + +struct FractalHeapIndirectBlock + offset::RelOffset # position of iblock in file + block_offset::UInt64 # block offset in heaps address space + dblocks::Vector{FractalHeapDirectBlock} + iblocks::Vector{FractalHeapIndirectBlock} +end + +function blocksize(blocknum, starting_size, table_width) + #block numbering starts at zero + rownum = Int(blocknum ÷ table_width) + (2^(max(0,rownum-1))) * starting_size +end + +function block_num_size_start(offset, hh) + width = hh.table_width + # first compute row number + r = Int(offset ÷ (hh.starting_block_size*width)) + r > 2 && (r = ceil(Int, log2(r+1))) + # row start offset + row_startoffset = (r>1 ? 2^(r-1) : r)*hh.starting_block_size*width + block_size = (2^(max(0,r-1))) * hh.starting_block_size + block_num = width*r + (offset-row_startoffset) ÷ block_size + block_start = row_startoffset + block_size*(block_num-width*r) + block_num, block_size, block_start +end + + +function read_fractal_heap_header(f, offset) + io = f.io + seek(io, fileoffset(f, offset)) # may need to compute fileoffset + cio = begin_checksum_read(io) + + signature = jlread(cio, UInt32) + signature == FRACTAL_HEAP_HEADER_SIGNATURE || throw(InvalidDataException("Signature does not match.")) + + version = jlread(cio, UInt8) + heap_id_length = jlread(cio, UInt16) + io_filter_encoded_length = jlread(cio, UInt16) + flags = jlread(cio, UInt8) + max_size_managed_objects = jlread(cio, UInt32) + next_huge_object_id = jlread(cio, Length) + huge_object_v2btree_address = jlread(cio, RelOffset) + free_space_in_managed_blocks = jlread(cio, Length) + managed_block_free_space_manager = jlread(cio, RelOffset) + managed_space_in_heap = jlread(cio, Length) + allocated_space_in_heap = jlread(cio, Length) + direct_block_allocation_iterator_offset = jlread(cio, Length) + 
managed_objects_number_in_heap = jlread(cio, Length) + huge_objects_size_in_heap = jlread(cio, Length) + huge_objects_number_in_heap = jlread(cio, Length) + tiny_objects_size_in_heap = jlread(cio, Length) + tiny_objects_number_in_heap = jlread(cio, Length) + + table_width = jlread(cio, UInt16) + starting_block_size = jlread(cio, Length) + max_direct_block_size = jlread(cio, Length) + max_heap_size = jlread(cio, UInt16) + num_starting_rows_in_root_iblock = jlread(cio, UInt16) + root_block_address = jlread(cio, RelOffset) + cur_num_rows_in_root_iblock = jlread(cio, UInt16) + + has_io_filter = io_filter_encoded_length > 0 + if has_io_filter + filtered_root_direct_block_size = jlread(cio, Length) + io_filter_mask = jlread(cio, UInt32) + io_filter_information = jlread(cio, UInt8, io_filter_encoded_length) + else + filtered_root_direct_block_size = typemax(Length) + io_filter_mask = typemax(UInt32) + io_filter_information = UInt8[] + end + + # Checksum + end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException("Invalid Checksum")) + + max_dblock_rows = (log2(max_direct_block_size) - log2(starting_block_size))+2 |> Int + + FractalHeapHeader(offset, table_width, starting_block_size, max_direct_block_size, max_heap_size, + root_block_address, cur_num_rows_in_root_iblock, has_io_filter, max_dblock_rows, + max_size_managed_objects) +end + +function read_indirect_block(f, offset, hh, nrows::Int) + io = f.io + seek(io, fileoffset(f, offset)) + cio = begin_checksum_read(io) + + signature = jlread(cio, UInt32) + signature == FRACTAL_HEAP_INDIRECT_BLOCK_SIGNATURE || throw(InvalidDataException("Signature does not match.")) + + version = jlread(cio, UInt8) + heap_header_address = jlread(cio, RelOffset) + # number of bytes for block offset + offset_byte_num = ceil(Int, hh.max_heap_size / 8) + block_offset = to_uint64(jlread(cio, UInt8, offset_byte_num)) + + # Read child direct blocks + block_start = block_offset + K = min(nrows, hh.max_dblock_rows)*hh.table_width + 
dblocks = map(1:K) do k + dblock_address = jlread(cio, RelOffset) + dblock_size = blocksize(k-1, hh.starting_block_size, hh.table_width) + if hh.has_io_filter > 0 + filtered_size = jlread(cio, Length) + filter_mask = jlread(cio, UInt32) + else + filtered_size = typemax(Length) + filter_mask = typemax(UInt32) + end + dblock = FractalHeapDirectBlock(dblock_address, block_start, dblock_size, filtered_size, filter_mask) + block_start += dblock_size + return dblock + end + N = (nrows <= hh.max_dblock_rows) ? 0 : (nrows-hh.max_dblock_rows)*hh.table_width + iblock_addresses = map(1:N) do n + jlread(cio, RelOffset) + end + + # Checksum + end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException()) + + iblocks = Vector{FractalHeapIndirectBlock}(undef, N) + for n=1:N + iblock_offset = iblock_addresses[n] + iblock_offset == UNDEFINED_ADDRESS && break + # figure out iblock size / nrows + block_num = K+(n-1) + rownum = block_num ÷ hh.table_width + block_size = (2^(max(0,rownum-1))) * hh.starting_block_size + sub_iblock_nrows::Int = (log2(block_size)-log2(hh.starting_block_size* hh.table_width))+1 + iblocks[n] = read_indirect_block(f, iblock_offset , hh, sub_iblock_nrows) + end + FractalHeapIndirectBlock(offset, block_offset, dblocks, iblocks) +end + +##################################################################################################### +## Version 2 B-trees +##################################################################################################### + +const V2_BTREE_HEADER_SIGNATURE = htol(0x44485442) # UInt8['B','T','H','D'] +const V2_BTREE_INTERNAL_NODE_SIGNATURE = htol(0x4e495442) # UInt8['B', 'T', 'I', 'N'] +const V2_BTREE_LEAF_NODE_SIGNATURE = htol(0x464c5442) # UInt8['B', 'T', 'L', 'F'] + +struct BTreeHeaderV2 + offset::RelOffset + type::Int + node_size::Int + record_size::Int + depth::Int + split_percent::Int + merge_percent::Int + root_node_address::RelOffset + num_records_in_root_node::Int + num_records_in_tree::Int +end + +abstract 
type BTreeNodeV2 end +abstract type BTreeRecordV2 end + +struct BTreeInternalNodeV2 <: BTreeNodeV2 + offset::RelOffset + type::UInt8 + records::Vector{Any} + child_nodes::Vector #abstract to defer loading +end + +struct BTreeLeafNodeV2 <: BTreeNodeV2 + offset::RelOffset + type::UInt8 + records::Vector{<:BTreeRecordV2} +end + +struct BTreeType5RecordV2 <: BTreeRecordV2 + hash::UInt32 + offset::UInt64 + length::Int +end + +function read_v2btree_header(f, offset) + io = f.io + seek(io, fileoffset(f, offset)) + cio = begin_checksum_read(io) + + signature = jlread(cio, UInt32) + signature == V2_BTREE_HEADER_SIGNATURE || throw(InvalidDataException("Signature does not match.")) + + version = jlread(cio, UInt8) + type = jlread(cio, UInt8) + node_size = jlread(cio, UInt32) + record_size = jlread(cio, UInt16) + depth = jlread(cio, UInt16) + split_percent = jlread(cio, UInt8) + merge_percent = jlread(cio, UInt8) + root_node_address = jlread(cio, RelOffset) + num_records_in_root_node = jlread(cio, UInt16) + num_records_in_tree = jlread(cio, Length) + + end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException()) + BTreeHeaderV2( offset, + type, + node_size, + record_size, + depth, + split_percent, + merge_percent, + root_node_address, + num_records_in_root_node, + num_records_in_tree) +end + + +function read_v2btree_node(f, offset, num_records, depth, bh, hh) + if depth == 0 + return read_v2btree_leaf_node(f, offset, num_records, bh, hh) + end + io = f.io + seek(io, fileoffset(f, offset)) # may need to compute fileoffset + cio = begin_checksum_read(io) + + signature = jlread(cio, UInt32) + signature == V2_BTREE_INTERNAL_NODE_SIGNATURE || throw(InvalidDataException("Signature does not match.")) + + version = jlread(cio, UInt8) + type = jlread(cio, UInt8) + + records = map(1:num_records) do n + read_record(cio, type, hh) + end + + # determine number of bytes used to encode `num_records` + # this has to be done iteratively + # leaf node: + space = bh.node_size - 4 - 1 
- 1 - 4 + max_records = space ÷ bh.record_size + max_records_total = 0 + numbytes = size_size(max_records) + numbytes_total = 0 + + for d = 1:depth + space = bh.node_size - 4-1-1-4 - sizeof(RelOffset) - (d>1)*numbytes_total + max_records = space ÷ (bh.record_size + sizeof(RelOffset) + numbytes+(d>1)*numbytes_total) + numbytes = size_size(max_records) + max_records_total = max_records + (max_records+1)*max_records_total + numbytes_total = size_size(max_records_total) + end + numbytes_total = size_size2(max_records_total) + child_nodes = map(1:num_records+1) do _ + child_node_pointer = jlread(cio, RelOffset) # offset type + num_records = Int(to_uint64(jlread(cio, UInt8, numbytes))) + if depth > 1 + total_records = Int(to_uint64(jlread(cio, UInt8, numbytes_total))) + return (; child_node_pointer, num_records,total_records) + end + (; child_node_pointer, num_records) + end + end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException()) + + BTreeInternalNodeV2(offset, type, records, child_nodes) +end + + +function read_v2btree_leaf_node(f, offset, num_records, bh, hh) + io = f.io + seek(io, fileoffset(f, offset)) # may need to compute fileoffset + cio = begin_checksum_read(io) + + signature = jlread(cio, UInt32) + signature == V2_BTREE_LEAF_NODE_SIGNATURE || throw(InvalidDataException("Signature does not match.")) + version = jlread(cio, UInt8) + type = jlread(cio, UInt8) + records = map(1:num_records) do n + read_record(cio, type, hh) + end + + end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException()) + BTreeLeafNodeV2(offset, type, records) +end + + + +function read_record(io, type, hh) + if type == 5 # link name for indexed group + hash_of_name = jlread(io, UInt32) + # read Heap id for managed object + version_type = jlread(io, UInt8) + + offbytes = hh.max_heap_size÷8 + offset =Int(to_uint64(jlread(io, UInt8, offbytes))) + lnbytes = min(hh.max_direct_block_size, hh.max_size_managed_objects) |> size_size2 + length = Int(to_uint64(jlread(io, 
UInt8, lnbytes))) + skip(io, 6-offbytes-lnbytes) + return BTreeType5RecordV2(hash_of_name, offset, length) + else + throw(error("Not implemented record type")) + end +end + +function read_records_in_node(f, offset, num_records, depth, bh, hh) + if depth == 0 + return read_v2btree_leaf_node(f, offset, num_records, bh, hh).records + end + + node = read_v2btree_node(f, offset, num_records, depth, bh, hh)::BTreeInternalNodeV2 + + records = [] + for n=1:num_records+1 + child_offset = node.child_nodes[n].child_node_pointer + child_records = node.child_nodes[n].num_records + records_in_child = read_records_in_node(f, child_offset, child_records, depth-1, bh, hh) + append!(records, records_in_child) + n<=num_records && (push!(records, node.records[n])) + end + return records +end + +function get_block_offset(f, iblock, roffset, hh) + block_num, block_size, block_start = block_num_size_start(roffset, hh) + K = length(iblock.dblocks) + if block_num < K + dblock = iblock.dblocks[block_num+1] + return fileoffset(f,dblock.offset) + roffset - block_start + end + sub_iblock = iblock.iblocks[block_num-K+1] + get_block_offset(f, sub_iblock, roffset-block_start, hh) +end + +function read_btree(f, offset_hh, offset_bh) + hh = read_fractal_heap_header(f, offset_hh) + bh = read_v2btree_header(f, offset_bh) + + records = read_records_in_node(f, bh.root_node_address, bh.num_records_in_root_node, bh.depth, bh, hh) + if hh.cur_num_rows_in_root_iblock > 0 + indirect_rb = read_indirect_block(f, hh.root_block_address, hh, hh.cur_num_rows_in_root_iblock) + links = map(records) do r + offset = get_block_offset(f, indirect_rb, r.offset, hh) + seek(f.io, offset) + read_link(f.io) + end + else # there's only a single direct block at hh.root_block_address + links = map(records) do r + offset = fileoffset(f,hh.root_block_address) + r.offset + seek(f.io, offset) + read_link(f.io) + end + end + links +end + +########################################################################################### 
+## Old Style Group: V1 B-Tree & Name Index Heap ## +########################################################################################### + +function read_oldstyle_group(f, v1btree_address, name_index_heap) + local_heap = read_local_heap_header(f, name_index_heap) + links = read_v1btree(f, v1btree_address) + map(links) do link + link_name = read_in_local_heap(f, local_heap, link.link_name_offset) + (link_name, link.obj_header_address) + end +end + +const LOCAL_HEAP_SIGNATURE = htol(0x50414548) # UInt8['H', 'E', 'A', 'P'] +function read_local_heap_header(f, offset) + io = f.io + seek(io, fileoffset(f, offset)) + + signature = jlread(io, UInt32) + signature == LOCAL_HEAP_SIGNATURE || throw(InvalidDataException("Signature does not match.")) + + version = jlread(io, UInt8) + version == 0 || throw(UnsupportedVersionException("Local heap with version $version detected.")) + skip(io, 3) + data_segment_size = jlread(io, Length) + + # This field is important for computing where to add to the heap. Let's ignore that + offset_head_free_list = jlread(io, Length) + data_segment_offset = jlread(io, RelOffset) + (; offset=data_segment_offset, size=data_segment_size) +end + +function read_in_local_heap(f, local_heap, pos) + io = f.io + offset = local_heap.offset + pos + seek(io, fileoffset(f, offset)) + return read_bytestring(io) +end + +const V1_BTREE_NODE_SIGNATURE = htol(0x45455254) # UInt8['T', 'R', 'E', 'E'] +function read_v1btree(f, offset) + io = f.io + seek(io, fileoffset(f, offset)) + + signature = jlread(io, UInt32) + signature == V1_BTREE_NODE_SIGNATURE || throw(InvalidDataException("Signature does not match.")) + + # 0 for internal node, 1 for chunked datasets + node_type = jlread(io, UInt8) + node_type == 0 || throw(InvalidDataException("Expected a v1 btree for group nodes")) + # level of node. 
0 implies leaf node + node_level = jlread(io, UInt8) + # how many entries are used + entries_used = jlread(io, UInt16) + # maximum value appears to be the one from superblock + # but this is irrelevant for reading + left_sibling = jlread(io, RelOffset) + right_sibling = jlread(io, RelOffset) + links = [] + keys = [] + children = RelOffset[] + for _ = 1:entries_used + push!(keys, jlread(io, Length)) + push!(children, jlread(io, RelOffset)) + end + push!(keys, jlread(io, Length)) + + for child in children + if node_level > 0 + append!(links, read_v1btree(f, child)) + else + append!(links, read_symbol_table_node(f, child)) + end + end + return links +end + +function read_v1btree_dataset_chunks(f, offset, dimensionality) + io = f.io + seek(io, fileoffset(f, offset)) + + signature = jlread(io, UInt32) + signature == V1_BTREE_NODE_SIGNATURE || throw(InvalidDataException("Signature does not match.")) + + # 0 for internal node, 1 for chunked datasets + node_type = jlread(io, UInt8) + node_type == 1 || throw(InvalidDataException("Expected a v1 btree for dataset chunks")) + # level of node. 
0 implies leaf node + node_level = jlread(io, UInt8) + # how many entries are used + entries_used = jlread(io, UInt16) + # maximum value appears to be the one from superblock + # but this is irrelevant for reading + left_sibling = jlread(io, RelOffset) + right_sibling = jlread(io, RelOffset) + children = Any[] + for _ = 1:entries_used + chunk_size = Int(jlread(io, UInt32)) + filter_mask = Int(jlread(io, UInt32)) + index = jlread(io, UInt64, dimensionality) + push!(children, (offset=jlread(io, RelOffset), node_level, chunk_size, filter_mask, idx=tuple(Int.(index)...))) + end + + chunks = Any[] + for child in children + if child.node_level > 0 + append!(chunks, read_v1btree_dataset_chunks(f, child.offset, dimensionality)) + else + push!(chunks, child) + end + end + return chunks +end + + +const SYMBOL_TABLE_NODE_SIGNATURE = htol(0x444f4e53) # UInt8['S', 'N', 'O', 'D'] + +function read_symbol_table_node(f, offset) + io = f.io + seek(io, fileoffset(f, offset)) + + signature = jlread(io, UInt32) + signature == SYMBOL_TABLE_NODE_SIGNATURE || throw(InvalidDataException("Signature does not match.")) + + version = jlread(io, UInt8) + skip(io, 1) + num_symbols = jlread(io, UInt16) + links = [] + + for _=1:num_symbols + link_name_offset = jlread(io, Length) # RelOffset but this is probably wrong + obj_header_address = jlread(io, RelOffset) + skip(io, 24) + push!(links, (; link_name_offset, obj_header_address)) + end + return links +end \ No newline at end of file diff --git a/src/global_heaps.jl b/src/global_heaps.jl index dcc130ed..6252edab 100644 --- a/src/global_heaps.jl +++ b/src/global_heaps.jl @@ -106,10 +106,16 @@ function jlread(io::IO, ::Type{GlobalHeap}) startpos = position(io) free = heapsz while free > 8 + jlsizeof(Length) - push!(objects, position(io)) + curpos = position(io) objidx = jlread(io, UInt16) objidx == 0 && break - objidx == index || throw(UnsupportedFeatureException()) + if objidx > index + append!(objects, fill(typemax(Int), objidx-index)) + index = 
objidx + elseif objidx < index + throw(InvalidDataException("Encountered unordered list of global heap objects.")) + end + push!(objects, curpos) skip(io, 6) # Reference count and reserved sz = jlread(io, Length) # Length skip(io, sz + 8 - mod1(sz, 8)) # Payload diff --git a/src/groups.jl b/src/groups.jl index 55fe912d..01ab9c10 100644 --- a/src/groups.jl +++ b/src/groups.jl @@ -203,16 +203,29 @@ function Base.keys(g::Group) end Base.keytype(f::Group) = String - -struct LinkInfo - version::UInt8 - flags::UInt8 - fractal_heap_address::RelOffset - name_index_btree::RelOffset + +# LinkInfo struct is used for dispatch +struct LinkInfo end +# In general the size depends on flags +# when writing files we always use 18 bytes +# this function is only used for writing +jlsizeof(::Type{LinkInfo}) = 18 +function jlwrite(io, ::LinkInfo) + jlwrite(io, zero(UInt16)) + jlwrite(io, typemax(UInt64)) + jlwrite(io, typemax(UInt64)) + return nothing end -define_packed(LinkInfo) -LinkInfo() = LinkInfo(0, 0, UNDEFINED_ADDRESS, UNDEFINED_ADDRESS) +function jlread(io, ::Type{LinkInfo}) + version = jlread(io, UInt8) + flags = jlread(io, UInt8) + (flags & 0b1) == 1 && skip(io, 8) + fractal_heap_address = jlread(io, RelOffset) + name_index_btree = jlread(io, RelOffset) + (flags & 0b10) == 0b10 && skip(io, 8) + (; version, flags, fractal_heap_address, name_index_btree) +end @enum(CharacterSet, CSET_ASCII, @@ -256,41 +269,73 @@ const CONTINUATION_MSG_SIZE = jlsizeof(HeaderMessage) + jlsizeof(RelOffset) + jl function load_group(f::JLDFile, roffset::RelOffset) io = f.io - chunk_start_offset::Int64 = fileoffset(f, roffset) - seek(io, chunk_start_offset) - - cio = begin_checksum_read(io) - sz = read_obj_start(cio) - chunk_checksum_offset::Int64 = position(cio) + sz - + chunk_start::Int64 = fileoffset(f, roffset) + seek(io, chunk_start) + + header_version = jlread(io, UInt8) + if header_version == 1 + seek(io, chunk_start) + cio = io + sz,_,groupflags = read_obj_start(cio) + chunk_end = 
position(cio) + sz + # Skip to nearest 8byte aligned position + skip_to_aligned!(cio, chunk_start) + else + header_version = 2 + seek(io, chunk_start) + cio = begin_checksum_read(io) + sz,_,groupflags = read_obj_start(cio) + chunk_end = position(cio) + sz + end # Messages + chunk_end::Int64 continuation_message_goes_here::Int64 = -1 links = OrderedDict{String,RelOffset}() + chunks = [(; chunk_start, chunk_end)] + chunk_number = 0 - continuation_offset::Int64 = -1 - continuation_length::Length = 0 next_link_offset::Int64 = -1 + link_phase_change_max_compact::Int64 = -1 + link_phase_change_min_dense::Int64 = -1 est_num_entries::Int64 = 4 est_link_name_len::Int64 = 8 - - while true - if continuation_offset != -1 - seek(io, continuation_offset) - chunk_checksum_offset = continuation_offset + continuation_length - 4 - continuation_offset = -1 - - cio = begin_checksum_read(io) - jlread(cio, UInt32) == OBJECT_HEADER_CONTINUATION_SIGNATURE || throw(InvalidDataException()) + fractal_heap_address = UNDEFINED_ADDRESS + name_index_btree = UNDEFINED_ADDRESS + + v1btree_address = UNDEFINED_ADDRESS + name_index_heap = UNDEFINED_ADDRESS + + while !isempty(chunks) + chunk = popfirst!(chunks) + chunk_start = chunk.chunk_start + chunk_end = chunk.chunk_end + + if chunk_number > 0 + seek(io, chunk_start) + chunk_end -= 4 + if header_version == 2 + cio = begin_checksum_read(io) + jlread(cio, UInt32) == OBJECT_HEADER_CONTINUATION_SIGNATURE || throw(InvalidDataException()) + end end - - while (curpos = position(cio)) <= chunk_checksum_offset - 4 - msg = jlread(cio, HeaderMessage) - endpos = curpos + jlsizeof(HeaderMessage) + msg.size + chunk_number += 1 + while (curpos = position(cio)) < chunk_end-4 + if header_version == 1 + # Message start 8byte aligned relative to object start + skip_to_aligned!(cio, chunk_start) + # Version 1 header message is padded + msg = HeaderMessage(jlread(cio, UInt16), jlread(cio, UInt16), jlread(cio, UInt8)) + skip(cio, 3) + else # header_version == 2 + msg = 
jlread(cio, HeaderMessage) + (groupflags & 4) == 4 && skip(cio, 2) + end + endpos = position(cio) + msg.size if msg.msg_type == HM_NIL if continuation_message_goes_here == -1 && - chunk_checksum_offset - curpos == CONTINUATION_MSG_SIZE + chunk_end - curpos == CONTINUATION_MSG_SIZE continuation_message_goes_here = curpos - elseif endpos + CONTINUATION_MSG_SIZE == chunk_checksum_offset + elseif endpos + CONTINUATION_MSG_SIZE == chunk_end # This is the remaining space at the end of a chunk # Use only if a message can potentially fit inside # Single Character Name Link Message has 13 bytes payload @@ -302,25 +347,37 @@ function load_group(f::JLDFile, roffset::RelOffset) continuation_message_goes_here = -1 if msg.msg_type == HM_LINK_INFO link_info = jlread(cio, LinkInfo) - link_info.fractal_heap_address == UNDEFINED_ADDRESS || throw(UnsupportedFeatureException()) + fractal_heap_address = link_info.fractal_heap_address + name_index_btree = link_info.name_index_btree elseif msg.msg_type == HM_GROUP_INFO if msg.size > 2 # Version Flag - jlread(cio, UInt8) == 0 || throw(UnsupportedFeatureException()) - # Verify that non-default group size is given - jlread(cio, UInt8) == 2 || throw(UnsupportedFeatureException()) - est_num_entries = jlread(cio, UInt16) - est_link_name_len = jlread(cio, UInt16) + jlread(io, UInt8) == 0 || throw(UnsupportedFeatureException()) + flag = jlread(io, UInt8) + if flag%2 == 1 # first bit set + link_phase_change_max_compact = jlread(io, UInt16) + link_phase_change_min_dense = jlread(io, UInt16) + end + if (flag >> 1)%2 == 1 # second bit set + # Verify that non-default group size is given + est_num_entries = jlread(io, UInt16) + est_link_name_len = jlread(io, UInt16) + end end elseif msg.msg_type == HM_LINK_MESSAGE name, loffset = read_link(cio) links[name] = loffset elseif msg.msg_type == HM_OBJECT_HEADER_CONTINUATION - continuation_offset = chunk_start_offset = fileoffset(f, jlread(cio, RelOffset)) - continuation_length = jlread(cio, Length) + 
cont_chunk_start = fileoffset(f, jlread(cio, RelOffset)) + chunk_length = jlread(cio, Length) + push!(chunks, (;chunk_start=cont_chunk_start, + chunk_end =cont_chunk_start+chunk_length)) # For correct behaviour, empty space can only be filled in the # very last chunk. Forget about previously found empty space next_link_offset = -1 + elseif msg.msg_type == HM_SYMBOL_TABLE + v1btree_address = jlread(cio, RelOffset) + name_index_heap = jlread(cio, RelOffset) elseif (msg.flags & 2^3) != 0 throw(UnsupportedFeatureException()) end @@ -329,14 +386,28 @@ function load_group(f::JLDFile, roffset::RelOffset) end # Checksum - seek(cio, chunk_checksum_offset) - end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException()) + seek(cio, chunk_end) + if header_version == 2 + end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException()) + end + end - continuation_offset == -1 && break + if fractal_heap_address != UNDEFINED_ADDRESS + records = read_btree(f, fractal_heap_address, name_index_btree) + for r in records + links[r[1]] = r[2] + end + end + + if v1btree_address != UNDEFINED_ADDRESS + records = read_oldstyle_group(f, v1btree_address, name_index_heap) + for r in records + links[r[1]] = r[2] + end end - Group{typeof(f)}(f, chunk_start_offset, continuation_message_goes_here, - chunk_checksum_offset, next_link_offset, est_num_entries, + Group{typeof(f)}(f, chunk_start, continuation_message_goes_here, + chunk_end, next_link_offset, est_num_entries, est_link_name_len, OrderedDict{String,RelOffset}(), OrderedDict{String,Group}(), links) end diff --git a/src/inlineunion.jl b/src/inlineunion.jl index fca712f3..2a00c449 100644 --- a/src/inlineunion.jl +++ b/src/inlineunion.jl @@ -48,18 +48,17 @@ end # exept for the ReadRepresentation and the very last line where the data is # converted back into a Union Array function read_array(f::JLDFile, dataspace::ReadDataspace, - rr::ReadRepresentation{InlineUnionEl{T1,T2},RR}, data_length::Int, - filter_id::UInt16, 
header_offset::RelOffset, + rr::ReadRepresentation{InlineUnionEl{T1,T2},RR}, layout::DataLayout, + filters::FilterPipeline, header_offset::RelOffset, attributes::Union{Vector{ReadAttribute},Nothing}) where {T1, T2,RR} io = f.io - data_offset = position(io) ndims, offset = get_ndims_offset(f, dataspace, attributes) seek(io, offset) v = construct_array(io, InlineUnionEl{T1,T2}, Val(Int(ndims))) n = length(v) - seek(io, data_offset) - if !iszero(filter_id) - read_compressed_array!(v, f, rr, data_length, filter_id) + seek(io, layout.data_offset) + if iscompressed(filters) + read_compressed_array!(v, f, rr, layout.data_length, filters) else read_array!(v, f, rr) end diff --git a/src/misc.jl b/src/misc.jl index fff927d1..79f372fd 100644 --- a/src/misc.jl +++ b/src/misc.jl @@ -100,9 +100,65 @@ function size_size(sz::Integer) end end +# Get the size of the size +function size_size2(sz::Integer) + if sz < 2^8 + 1 + elseif sz < 2^16 + 2 + elseif sz < 2^24 + 3 + elseif sz < 2^32 + 4 + elseif sz < 2^40 + 5 + elseif sz < 2^48 + 6 + elseif sz < 2^56 + 7 + else + 8 + end +end + + """ symbol_length(x::Symbol) Returns the length of the string represented by `x`. """ symbol_length(x::Symbol) = ccall(:strlen, Int, (Cstring,), x) + +function uintofsize(sz) + if sz == 1 + UInt8 + elseif sz == 2 + UInt16 + elseif sz == 4 + UInt32 + else + UInt64 + end +end + +function to_uint64(bts::Vector{UInt8}) + bts2 = [bts; zeros(UInt8, 8-length(bts))] + u = zero(UInt64) + for b in reverse(bts2) + u = u << 8 + u += b + end + u +end + +""" + skip_to_aligned!(io, rel=0) + +Skip to nearest position aligned to a multiple of 8 bytes relative to `rel`. 
+""" +function skip_to_aligned!(io, rel=0) + pos = position(io) + pos += 8 - mod1(pos-rel, 8) + seek(io, pos) + return nothing +end \ No newline at end of file diff --git a/src/object_headers.jl b/src/object_headers.jl index 7a23f309..85c093f7 100644 --- a/src/object_headers.jl +++ b/src/object_headers.jl @@ -25,6 +25,31 @@ const HM_DRIVER_INFO = 0x14 const HM_ATTRIBUTE_INFO = 0x15 const HM_REFERENCE_COUNT = 0x16 +MESSAGE_TYPES = Dict( + 0x00 => "HM_NIL", + 0x01 => "HM_DATASPACE", + 0x02 => "HM_LINK_INFO", + 0x03 => "HM_DATATYPE", + 0x04 => "HM_FILL_VALUE_OLD", + 0x05 => "HM_FILL_VALUE", + 0x06 => "HM_LINK_MESSAGE", + 0x07 => "HM_EXTERNAL_FILE_LIST", + 0x08 => "HM_DATA_LAYOUT", + 0x09 => "HM_BOGUS", + 0x0a => "HM_GROUP_INFO", + 0x0b => "HM_FILTER_PIPELINE", + 0x0c => "HM_ATTRIBUTE", + 0x0d => "HM_OBJECT_COMMENT", + 0x0f => "HM_SHARED_MESSAGE_TABLE", + 0x10 => "HM_OBJECT_HEADER_CONTINUATION", + 0x11 => "HM_SYMBOL_TABLE", + 0x12 => "HM_MODIFICATION_TIME", + 0x13 => "HM_BTREE_K_VALUES", + 0x14 => "HM_DRIVER_INFO", + 0x15 => "HM_ATTRIBUTE_INFO", + 0x16 => "HM_REFERENCE_COUNT", + ) + const OH_ATTRIBUTE_CREATION_ORDER_TRACKED = 2^2 const OH_ATTRIBUTE_CREATION_ORDER_INDEXED = 2^3 const OH_ATTRIBUTE_PHASE_CHANGE_VALUES_STORED = 2^4 @@ -42,21 +67,48 @@ define_packed(ObjectStart) # Reads the start of an object including the signature, version, flags, # and (payload) size. Returns the size. 
+# function read_obj_start(io::IO) +# os = jlread(io, ObjectStart) +# os.signature == OBJECT_HEADER_SIGNATURE || throw(InvalidDataException()) +# os.version == 2 || throw(UnsupportedVersionException()) + +# if (os.flags & OH_TIMES_STORED) != 0 +# # Skip access, modification, change and birth times +# skip(io, 128) +# end +# if (os.flags & OH_ATTRIBUTE_PHASE_CHANGE_VALUES_STORED) != 0 +# # Skip maximum # of attributes fields +# skip(io, 32) +# end + +# read_size(io, os.flags) +# end + function read_obj_start(io::IO) + curpos = position(io) os = jlread(io, ObjectStart) - os.signature == OBJECT_HEADER_SIGNATURE || throw(InvalidDataException()) - os.version == 2 || throw(UnsupportedVersionException()) + if os.version == 2 && os.signature == OBJECT_HEADER_SIGNATURE + if (os.flags & OH_TIMES_STORED) != 0 + # Skip access, modification, change and birth times + skip(io, 16) + end + if (os.flags & OH_ATTRIBUTE_PHASE_CHANGE_VALUES_STORED) != 0 + # Skip maximum # of attributes fields + skip(io, 4) + end - if (os.flags & OH_TIMES_STORED) != 0 - # Skip access, modification, change and birth times - skip(io, 128) - end - if (os.flags & OH_ATTRIBUTE_PHASE_CHANGE_VALUES_STORED) != 0 - # Skip maximum # of attributes fields - skip(io, 32) + return read_size(io, os.flags), 2, os.flags + else + seek(io, curpos) + version = jlread(io, UInt8) + version == 1 || throw(error("This should not have happened")) + + skip(io, 1) + num_messages = jlread(io, UInt16) + obj_ref_count = jlread(io, UInt32) + obj_header_size = jlread(io, UInt32) + return obj_header_size, 1, os.flags end - - read_size(io, os.flags) end struct HeaderMessage @@ -69,19 +121,353 @@ define_packed(HeaderMessage) function isgroup(f::JLDFile, roffset::RelOffset) io = f.io - seek(io, fileoffset(f, roffset)) - - sz = read_obj_start(io) - pmax::Int64 = position(io) + sz - while position(io) <= pmax-4 - msg = jlread(io, HeaderMessage) - endpos = position(io) + msg.size - if msg.msg_type == HM_LINK_INFO || msg.msg_type == 
HM_GROUP_INFO || msg.msg_type == HM_LINK_MESSAGE - return true - elseif msg.msg_type == HM_DATASPACE || msg.msg_type == HM_DATATYPE || msg.msg_type == HM_FILL_VALUE || msg.msg_type == HM_DATA_LAYOUT - return false + chunk_start = fileoffset(f, roffset) + seek(io, chunk_start) + + sz, version, = read_obj_start(io) + chunk_end::Int64 = position(io) + sz + if version == 2 + while position(io) <= chunk_end-4 + msg = jlread(io, HeaderMessage) + endpos = position(io) + msg.size + if msg.msg_type == HM_LINK_INFO || msg.msg_type == HM_GROUP_INFO || msg.msg_type == HM_LINK_MESSAGE || msg.msg_type == HM_SYMBOL_TABLE + return true + elseif msg.msg_type == HM_DATASPACE || msg.msg_type == HM_DATATYPE || msg.msg_type == HM_FILL_VALUE || msg.msg_type == HM_DATA_LAYOUT + return false + end + seek(io, endpos) + end + elseif version == 1 + chunks = [(; chunk_start, chunk_end)] + chunk_number = 0 + skip_to_aligned!(io, chunk_start) + + while !isempty(chunks) + chunk = popfirst!(chunks) + chunk_start = chunk.chunk_start + chunk_end = chunk.chunk_end + + if chunk_number > 0 + seek(io, chunk_start) + end + chunk_number += 1 + + while position(io) < chunk_end - 4 + # Message start 8byte aligned relative to object start + skip_to_aligned!(io, chunk_start) + # Version 1 header message is padded + msg = HeaderMessage(jlread(io, UInt16), jlread(io, UInt16), jlread(io, UInt8)) + skip(io, 3) + endpos = position(io) + msg.size + + if msg.msg_type in (HM_LINK_INFO, HM_GROUP_INFO, HM_LINK_MESSAGE, HM_SYMBOL_TABLE) + return true + elseif msg.msg_type in (HM_DATASPACE, HM_DATATYPE, HM_FILL_VALUE, HM_DATA_LAYOUT) + return false + elseif msg.msg_type == HM_OBJECT_HEADER_CONTINUATION + cont_chunk_start = fileoffset(f, jlread(io, RelOffset)) + chunk_length = jlread(io, Length) + push!(chunks, (; chunk_start=cont_chunk_start, chunk_end=cont_chunk_start+chunk_length)) + end + seek(io, endpos) + end end - seek(io, endpos) end return false end + +# code below is work in progress and for debugging +struct 
Message + type::UInt8 + header + fields +end + + +function read_link_info(io, msg_header, OffsetType) + @assert msg_header.msg_type == HM_LINK_INFO + version = jlread(io, UInt8) + flags = jlread(io, UInt8) + # Maximum Creation index + # exists if bit 0 of flag is set + if (flags & 0x1) == 0x1 + max_creation_index = jlread(io, UInt64) + else + max_creation_index = typemax(UInt64) + end + fractal_heap_address = jlread(io, OffsetType) + v2btree_name_index = jlread(io, OffsetType) + if (flags & 0x1) == 0x1 + v2btree_creation_index = jlread(io, OffsetType) + else + v2btree_creation_index = UNDEFINED_ADDRESS + end + return Message(HM_LINK_INFO,msg_header, (; + version, + flags, + fractal_heap_address, + v2btree_creation_index, + v2btree_name_index)) +end + +function print_header_messages(f::JLDFile, name::AbstractString) + if isempty(name) || name == "/" + print_header_messages(f, f.root_group_offset) + else + print_header_messages(f.root_group,name) + end +end + +function print_header_messages(g::Group, name::AbstractString) + f = g.f + f.n_times_opened == 0 && throw(ArgumentError("file is closed")) + (g, name) = pathize(g, name, false) + roffset = lookup_offset(g, name) + roffset != UNDEFINED_ADDRESS || throw(ArgumentError("did not find a group or dataset named \"$name\"")) + print_header_messages(f, roffset) +end + + +function print_header_messages(f::JLDFile, roffset::RelOffset) + io = f.io + chunk_start::Int64 = fileoffset(f, roffset) + seek(io, chunk_start) + + # Test for V1 Obj header + + header_version = Int(jlread(io, UInt8)) + if header_version == 1 + seek(io, chunk_start) + cio = io + sz,_,groupflags = read_obj_start(cio) + chunk_end = position(cio) + sz + # Skip to nearest 8byte aligned position + skip_to_aligned!(cio, chunk_start) + else + println("Object Header Message Version 2") + header_version = 2 + seek(io, chunk_start) + cio = begin_checksum_read(io) + sz,_,groupflags = read_obj_start(cio) + chunk_end = position(cio) + sz + @info "chunk 0" 
position(cio) sz chunk_end + end + # Messages + continuation_message_goes_here::Int64 = -1 + links = OrderedDict{String,RelOffset}() + chunks = [(; chunk_start, chunk_end)] + chunk_number = 0 + next_link_offset::Int64 = -1 + link_phase_change_max_compact::Int64 = -1 + link_phase_change_min_dense::Int64 = -1 + est_num_entries::Int64 = 4 + est_link_name_len::Int64 = 8 + chunk_end::Int64 + attrs = EMPTY_READ_ATTRIBUTES + while !isempty(chunks) + chunk = popfirst!(chunks) + chunk_start = chunk.chunk_start + chunk_end = chunk.chunk_end + + @info "Starting to read chunk no $chunk_number of length $(chunk_end-chunk_start)" + if chunk_number > 0 # Don't do this the first time around + seek(io, chunk_start) + if header_version == 2 + chunk_end -= 4 + cio = begin_checksum_read(io) + jlread(cio, UInt32) == OBJECT_HEADER_CONTINUATION_SIGNATURE || throw(InvalidDataException()) + end + end + chunk_number += 1 + @info "positions" position(cio) chunk_start chunk_end + while (curpos = position(cio)) < chunk_end-4 + if header_version == 1 + skip_to_aligned!(cio, chunk_start) + # Version 1 header message is padded + msg = HeaderMessage(jlread(cio, UInt16), jlread(cio, UInt16), jlread(cio, UInt8)) + skip(cio, 3) + else # version == 2 + msg = jlread(cio, HeaderMessage) + (groupflags & 4) == 4 && skip(cio, 2) + end + endpos = position(cio) + msg.size + println(""" + Message: $(MESSAGE_TYPES[msg.msg_type]) ($(msg.msg_type)) + size: $(msg.size) + flags: $(msg.flags) + at pos $(position(cio)-chunk_start)""") + if msg.msg_type == HM_NIL + if continuation_message_goes_here == -1 && + chunk_end - curpos == CONTINUATION_MSG_SIZE + continuation_message_goes_here = curpos + elseif endpos + CONTINUATION_MSG_SIZE == chunk_end + # This is the remaining space at the end of a chunk + # Use only if a message can potentially fit inside + # Single Character Name Link Message has 13 bytes payload + if msg.size >= 13 + next_link_offset = curpos + end + end + else + continuation_message_goes_here = -1 + if 
msg.msg_type == HM_LINK_INFO + fullmsg = read_link_info(cio, msg, RelOffset) + for (k,v) in pairs(fullmsg.fields) + println(" $k: $v") + end + elseif msg.msg_type == HM_GROUP_INFO + if msg.size > 2 + # Version Flag + jlread(io, UInt8) == 0 || throw(UnsupportedFeatureException()) + flag = jlread(io, UInt8) + if flag%2 == 1 # first bit set + link_phase_change_max_compact = jlread(io, UInt16) + link_phase_change_min_dense = jlread(io, UInt16) + println(" link_phase_change_max_compact = $link_phase_change_max_compact") + println(" link_phase_change_min_dense = $link_phase_change_min_dense") + end + if (flag >> 1)%2 == 1 # second bit set + # Verify that non-default group size is given + est_num_entries = jlread(io, UInt16) + est_link_name_len = jlread(io, UInt16) + println(" est_num_entries = $est_num_entries") + println(" est_link_name_len = $est_link_name_len") + end + end + elseif msg.msg_type == HM_LINK_MESSAGE + name, loffset = read_link(cio) + links[name] = loffset + println(" name = \"$name\"") + println(" offset = $(Int(loffset.offset))") + elseif msg.msg_type == HM_OBJECT_HEADER_CONTINUATION + continuation_offset = fileoffset(f, jlread(cio, RelOffset)) + continuation_length = jlread(cio, Length) + push!(chunks, (; chunk_start = continuation_offset, + chunk_end = continuation_offset + continuation_length)) + println(""" offset = $(continuation_offset)\n length = $(continuation_length)""") + println("pos=$(position(cio)) $chunk_end") + elseif msg.msg_type == HM_DATASPACE + dataspace = read_dataspace_message(cio) + println(" $dataspace") + elseif msg.msg_type == HM_DATATYPE + datatype_class, datatype_offset = read_datatype_message(cio, f, (msg.flags & 2) == 2) + println(""" class: $datatype_class\n offset: $datatype_offset""") + elseif msg.msg_type == HM_FILL_VALUE_OLD + #(jlread(cio, UInt8) == 3 && jlread(cio, UInt8) == 0x09) || throw(UnsupportedFeatureException()) + elseif msg.msg_type == HM_FILL_VALUE + #(jlread(cio, UInt8) == 3 && jlread(cio, UInt8) == 0x09) 
|| throw(UnsupportedFeatureException()) + elseif msg.msg_type == HM_DATA_LAYOUT + layout = jlread(cio, DataLayout, f) + @info layout + elseif msg.msg_type == HM_FILTER_PIPELINE + filter_pipeline = jlread(cio, FilterPipeline) + @info filter_pipeline + elseif msg.msg_type == HM_SYMBOL_TABLE + v1_btree_address = jlread(cio, RelOffset) + local_heap_address = jlread(cio, RelOffset) + println(""" required for \"old style" groups\n v1 B-Tree Adress: $(v1_btree_address)\n Local Heap Adress: $(local_heap_address)""") + elseif msg.msg_type == HM_ATTRIBUTE + if attrs === EMPTY_READ_ATTRIBUTES + attrs = ReadAttribute[read_attribute(cio, f)] + else + push!(attrs, read_attribute(cio, f)) + end + attr = attrs[end] + println(""" name: \"$(attr.name)\" """) + if attr.datatype_class != 0xff + println(""" datatype: $(DATATYPES[attr.datatype_class%16])""") + else + println(""" datatype: committed at $(attr.datatype_offset)""") + end + #try + data = read_attr_data(f, attr) + println(""" data: "$data" """) + #= catch e + println(""" loading data failed""") + end =# + elseif (msg.flags & 2^3) != 0 + throw(UnsupportedFeatureException()) + end + end + seek(cio, endpos) + end + + # Checksum + seek(cio, chunk_end) + if header_version == 2 + end_checksum(cio) == jlread(io, UInt32) || throw(InvalidDataException()) + end + end + nothing +end + + +struct DataLayout + version::UInt8 + storage_type::UInt8 + data_length::Int64 + data_offset::Int64 + dimensionality::UInt8 + chunk_indexing_type::UInt8 # only in version 4 + chunk_dimensions::Vector{UInt32} # only defined if dimensionality > 0 + DataLayout(version, storage_type, data_length, data_offset) = + new(version, storage_type, data_length, data_offset, 0, 0) + DataLayout(version, storage_type, data_length, data_offset, dimensionality, chunk_indexing_type, chunk_dimensions) = + new(version, storage_type, data_length, data_offset, dimensionality, chunk_indexing_type, chunk_dimensions) +end + +ischunked(dl::DataLayout) = dl.storage_type == 2 + 
+function jlread(cio, ::Type{DataLayout}, f) + version = jlread(cio, UInt8) + if version == 4 || version == 3 + storage_type = jlread(cio, UInt8) + if storage_type == LC_COMPACT_STORAGE + data_length = jlread(cio, UInt16) + data_offset = position(cio) + return DataLayout(version, storage_type, data_length, data_offset) + elseif storage_type == LC_CONTIGUOUS_STORAGE + rf = jlread(cio, RelOffset) + data_offset = rf != UNDEFINED_ADDRESS ? fileoffset(f, rf) : typemax(Int64) + data_length = jlread(cio, Length) + DataLayout(version, storage_type, data_length, data_offset) + elseif version == 4 && storage_type == LC_CHUNKED_STORAGE + # TODO: validate this + flags = jlread(cio, UInt8) + dimensionality = jlread(cio, UInt8) + dimensionality_size = jlread(cio, UInt8) + #skip(cio, Int(dimensionality)*Int(dimensionality_size)) + chunk_dimensions = [read_nb_uint(cio, dimensionality_size) for _=1:dimensionality] + chunk_indexing_type = jlread(cio, UInt8) + @info "chunk dims" tuple(chunk_dimensions...) dimensionality dimensionality_size chunk_indexing_type + chunk_indexing_type == 1 || throw(UnsupportedFeatureException("Unknown chunk indexing type")) + data_length = jlread(cio, Length) + jlread(cio, UInt32) + data_offset = fileoffset(f, jlread(cio, RelOffset)) + chunked_storage = true + DataLayout(version, storage_type, data_length, data_offset, dimensionality, chunk_indexing_type, chunk_dimensions) + + elseif version == 3 && storage_type == LC_CHUNKED_STORAGE + dimensionality = jlread(cio, UInt8) + rf = jlread(cio, RelOffset) + data_offset = rf != UNDEFINED_ADDRESS ? 
fileoffset(f, rf) : typemax(Int64) + chunk_dimensions = jlread(cio, UInt32, dimensionality-1) + data_length = jlread(cio, UInt32) + chunked_storage = true + DataLayout(version, storage_type, data_length, data_offset, dimensionality, 0, chunk_dimensions) + else + throw(UnsupportedFeatureException("Unknown data layout")) + end + else + throw(UnsupportedVersionException("Data layout message version $version is not supported")) + end +end + +function read_nb_uint(io::IO, nb) + val = zero(UInt) + for n = 1:nb + #val = val << 8 + val += jlread(io, UInt8)*(2^(8n)) + end + val +end \ No newline at end of file diff --git a/src/superblock.jl b/src/superblock.jl index 22c1fd71..12de54a6 100644 --- a/src/superblock.jl +++ b/src/superblock.jl @@ -4,20 +4,11 @@ const SUPERBLOCK_SIGNATURE = htol(0x0a1a0a0d46444889) # UInt8[0o211, 'H', 'D', 'F', '\r', '\n', 0o032, '\n'] -# https://www.hdfgroup.org/HDF5/doc/H5.format.html#FileMetaData -# Superblock (Version 2) -struct Superblock - file_consistency_flags::UInt8 - base_address::Int64 - superblock_extension_address::RelOffset - end_of_file_address::Int64 - root_group_object_header_address::RelOffset -end +# Data starts after file header and after superblock +const DATA_START = FILE_HEADER_LENGTH + (12+8*4+4) -jlsizeof(::Union{Type{Superblock},Superblock}) = - 12+jlsizeof(RelOffset)*4+4 -function jlread(io::IO, ::Type{Superblock}) +function read_superblock(io::IO) cio = begin_checksum_read(io) # Signature @@ -26,40 +17,87 @@ function jlread(io::IO, ::Type{Superblock}) # Version version = jlread(cio, UInt8) - version == 2 || throw(UnsupportedVersionException()) + if version == 0 + version_free_space_storage = jlread(cio, UInt8) # has to be zero + version_root_group_symbol_table_enty = jlread(cio, UInt8) # has to be zero + jlread(cio, UInt8) + version_share_header_msg_format = jlread(cio, UInt8) # has to be zero + size_of_offsets = jlread(cio, UInt8) + size_of_lengths = jlread(cio, UInt8) + size_of_lengths == 8 && size_of_offsets == 8 || 
throw(UnsupportedFeatureException("Only files with length and offset size of 8 bytes are supported.")) + jlread(cio, UInt8) + group_leaf_node_k = jlread(cio, UInt16) # must be greater than zero + group_internal_node_k = jlread(cio, UInt16) # must be greater than zero + # Unused File consistency flags + jlread(cio, UInt32) + #indexed_storage_internal_node_k = jlread(cio, UInt16) # must be greater than zero + #jlread(cio, UInt16) + base_address = jlread(cio, UInt64) # base adress for offsets within file (also absolute address of superblock) + adress_free_space_info = jlread(cio, RelOffset) # Undefined Adress + end_of_file_address = jlread(cio, UInt64) # absolute adress of first byte past end of data + driver_info_block_adress = jlread(cio, RelOffset) # undefined of relative adress of driver info block + #root_group_symbol_table_entry = jlread(cio, UInt32) # symbol table entry of root group + + link_name_offset = jlread(cio, RelOffset) + root_group_object_header_address = jlread(cio, RelOffset) + cachetype = jlread(cio, UInt32) + reserved = jlread(cio, UInt32) + scratchspace = jlread(cio, UInt128) + + # Discard Checksum + end_checksum(cio) - # Size of offsets and size of lengths - size_of_offsets = jlread(cio, UInt8) - size_of_lengths = jlread(cio, UInt8) - (size_of_offsets == 8 && size_of_lengths == 8) || throw(UnsupportedFeatureException()) + (; version, base_address, end_of_file_address, root_group_object_header_address) + elseif version == 2 || version == 3 + + # Size of offsets and size of lengths + size_of_offsets = jlread(cio, UInt8) + size_of_lengths = jlread(cio, UInt8) + (size_of_offsets == 8 && size_of_lengths == 8) || throw(UnsupportedFeatureException("Only files with length and offset size of 8 bytes are supported.")) - # File consistency flags - file_consistency_flags = jlread(cio, UInt8) + # File consistency flags + file_consistency_flags = jlread(cio, UInt8) - # Addresses - base_address = jlread(cio, Int64) - superblock_extension_address = jlread(cio, 
RelOffset) - end_of_file_address = jlread(cio, Int64) - root_group_object_header_address = jlread(cio, RelOffset) + # Addresses + base_address = jlread(cio, UInt64) + superblock_extension_address = jlread(cio, RelOffset) + end_of_file_address = jlread(cio, UInt64) + root_group_object_header_address = jlread(cio, RelOffset) - # Checksum - cs = end_checksum(cio) - jlread(io, UInt32) == cs || throw(InvalidDataException()) + # Checksum + cs = end_checksum(cio) + jlread(io, UInt32) == cs || throw(InvalidDataException()) - Superblock(file_consistency_flags, base_address, superblock_extension_address, - end_of_file_address, root_group_object_header_address) + (; version, base_address, end_of_file_address, root_group_object_header_address) + else + throw(UnsupportedVersionException("superblock version $version is not supported.")) + end end -function jlwrite(io::IO, s::Superblock) +function write_superblock(io::IO, f) cio = begin_checksum_write(io, 8+4+4*jlsizeof(RelOffset)) jlwrite(cio, SUPERBLOCK_SIGNATURE::UInt64) # Signature jlwrite(cio, UInt8(2)) # Version jlwrite(cio, UInt8(8)) # Size of offsets jlwrite(cio, UInt8(8)) # Size of lengths - jlwrite(cio, s.file_consistency_flags::UInt8) - jlwrite(cio, s.base_address::Int64) - jlwrite(cio, s.superblock_extension_address::RelOffset) - jlwrite(cio, s.end_of_file_address::Int64) - jlwrite(cio, s.root_group_object_header_address::RelOffset) + jlwrite(cio, UInt8(0)) # file_consistency_flags + jlwrite(cio, f.base_address::UInt64) + jlwrite(cio, UNDEFINED_ADDRESS) + jlwrite(cio, UInt64(f.end_of_data)) + jlwrite(cio, f.root_group_offset::RelOffset) jlwrite(io, end_checksum(cio)) end + +function find_superblock(f) + # Search at 0, 512, 1024, 2048 ... 
+ for offset in (0, 512, 1024, 2048, 4096) + seek(f.io, offset) + # Signature + signature = jlread(f.io, UInt64) + if signature == SUPERBLOCK_SIGNATURE + seek(f.io, offset) + return read_superblock(f.io) + end + end + throw(InvalidDataException("Did not find a Superblock.")) +end diff --git a/test/runtests.jl b/test/runtests.jl index c9e0f10c..31785049 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -28,4 +28,5 @@ include("isreconstructed.jl") include("backwards_compatibility.jl") include("inlineunion.jl") include("customserialization.jl") -include("compression.jl") \ No newline at end of file +include("compression.jl") +include("test_files.jl") \ No newline at end of file diff --git a/test/test_files.jl b/test/test_files.jl new file mode 100644 index 00000000..a738442b --- /dev/null +++ b/test/test_files.jl @@ -0,0 +1,119 @@ +using Test, JLD2 +@testset "HDF5 compat test files" begin + # These are test files copied from the HDF5.jl test suite + cd(joinpath(@__DIR__,"test_files")) do + + fn = "compound.h5" + jldopen(fn) do f + data = f["data"] + @test data[1] == data[2] + nt = data[1] + @test nt.wgt == 1.0 + @test nt.xyz == [-2.4559041161056125, 0.43236207188504794, -0.5088338908493437] + @test nt.uvw == [-0.44966656055677057, 0.6453930541533174, 0.6174688574881305] + @test nt.E == 1.1915731810042547 + end + + # Should return some enum type and load names correctly + fn = "h5ex_t_enum.h5" + jldopen(fn) do f + @test size(f["DS1"]) == (7,4) + end + + fn = "h5ex_t_array.h5" + jldopen(fn) do f + @test f["DS1"][1] == (0:-1:-4) .* [0,1,2]' + @test f["DS1"][2] == hcat(collect(0:4), ones(Int,5), collect(2:-1:-2)) + end + + fn = "h5ex_t_float.h5" + jldopen(fn) do f + @test size(f["DS1"]) == (7,4) + @test f["DS1"][9] ≈ 5/3 + end + + # Big Endian Integers are not implemented + fn = "h5ex_t_int.h5" + jldopen(fn) do f + @test f["DS1"] == [0:-1:-6 zeros(Int,7) 0:6 0:2:12] + end + + fn = "h5ex_t_objref.h5" + jldopen(fn) do f + @test f["DS1"][1] === f["G1"] + @test 
f["DS1"][2] === f["DS2"] + end + + fn = "h5ex_t_opaque.h5" + jldopen(fn) do f + @test f["DS1"][4].data == [0x4f, 0x50, 0x41, 0x51, 0x55, 0x45, 0x30] + end + + fn = "h5ex_t_string.h5" + jldopen(fn) do f + @test f["DS1"] == ["Parting", "is such", "sweet", "sorrow."] + end + + fn = "h5ex_t_vlen.h5" + jldopen(fn) do f + @test f["DS1"][1] == [3, 2, 1] + @test f["DS1"][2] == [1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144] + end + + fn = "h5ex_t_vlstring.h5" + jldopen(fn) do f + @test f["DS1"] == ["Parting", "is such", "sweet", "sorrow."] + end + + fn = "nullterm_ascii.h5" + jldopen(fn) do f + @test f["test"] == "Hello World" + end + + fn = "large_fractal_heap.h5" + jldopen(fn) do f + @test length(keys(f)) == 200000 + end + + fn = "netcdf.nc" + jldopen(fn) do f + @test f["hello"] == ones(5) + @test_broken f["x"] + @test_broken f["z"] + @test f["grouped/data"] == 0:9 + @test_broken f["grouped/y"] + end + + fn = "simple.nc" + jldopen(fn) do f + @test f["dim1"] == [2, 4, 6] + @test f["dim2"] == ["a", "b", "c", "d"] + @test f["mydata"] == Matrix(reshape(1:12, 4, 3)) + JLD2.load_attributes(f, "dim1") # not sure what to test for. 
just not erroring so far + JLD2.load_attributes(f, "dim2") + JLD2.load_attributes(f, "mydata") + end + + # julia> using JLD + # julia> struct A; x::Int; y::Float64; z::String; end + # julia> save("jldstruct.jld", "a", A(1,2.0,"3")) + fn = "jldstruct.jld" + jldopen(fn) do f + a = f["a"] + @test a.x == 1 + @test a.y == 2.0 + @test a.z == "3" + end + + fn = "chunking1.h5" + jldopen(fn) do f + @test f["uncompressed_chunks"] == reshape(1:1000., 25, 40) + @test f["compressed_chunks"] == reshape(1:1000., 25, 40) + @test f["shuffle_compressed_chunks"] == reshape(1:1000, 25, 40) + @test size(f["incomplete_allocation"]) == (50,50,10) + @test f["incomplete_allocation"][1:50,1:50, 2] == reshape(1:2500, 50,50) + #f["incomplete_allocation"][1,1,1] == 0 + end + + end +end \ No newline at end of file diff --git a/test/test_files/chunking1.h5 b/test/test_files/chunking1.h5 new file mode 100644 index 00000000..8a94b1ab Binary files /dev/null and b/test/test_files/chunking1.h5 differ diff --git a/test/test_files/compound.h5 b/test/test_files/compound.h5 new file mode 100644 index 00000000..741acb01 Binary files /dev/null and b/test/test_files/compound.h5 differ diff --git a/test/test_files/h5ex_t_array.h5 b/test/test_files/h5ex_t_array.h5 new file mode 100644 index 00000000..9de9ef82 Binary files /dev/null and b/test/test_files/h5ex_t_array.h5 differ diff --git a/test/test_files/h5ex_t_enum.h5 b/test/test_files/h5ex_t_enum.h5 new file mode 100644 index 00000000..76a7f734 Binary files /dev/null and b/test/test_files/h5ex_t_enum.h5 differ diff --git a/test/test_files/h5ex_t_float.h5 b/test/test_files/h5ex_t_float.h5 new file mode 100644 index 00000000..9c8cb981 Binary files /dev/null and b/test/test_files/h5ex_t_float.h5 differ diff --git a/test/test_files/h5ex_t_int.h5 b/test/test_files/h5ex_t_int.h5 new file mode 100644 index 00000000..33f4cbba Binary files /dev/null and b/test/test_files/h5ex_t_int.h5 differ diff --git a/test/test_files/h5ex_t_objref.h5 
b/test/test_files/h5ex_t_objref.h5 new file mode 100644 index 00000000..28751dd7 Binary files /dev/null and b/test/test_files/h5ex_t_objref.h5 differ diff --git a/test/test_files/h5ex_t_opaque.h5 b/test/test_files/h5ex_t_opaque.h5 new file mode 100644 index 00000000..4c740f5c Binary files /dev/null and b/test/test_files/h5ex_t_opaque.h5 differ diff --git a/test/test_files/h5ex_t_string.h5 b/test/test_files/h5ex_t_string.h5 new file mode 100644 index 00000000..7bd5112d Binary files /dev/null and b/test/test_files/h5ex_t_string.h5 differ diff --git a/test/test_files/h5ex_t_vlen.h5 b/test/test_files/h5ex_t_vlen.h5 new file mode 100644 index 00000000..374e15e1 Binary files /dev/null and b/test/test_files/h5ex_t_vlen.h5 differ diff --git a/test/test_files/h5ex_t_vlstring.h5 b/test/test_files/h5ex_t_vlstring.h5 new file mode 100644 index 00000000..00bb0370 Binary files /dev/null and b/test/test_files/h5ex_t_vlstring.h5 differ diff --git a/test/test_files/hdf5_test_files/h5ex_t_int.h5 b/test/test_files/hdf5_test_files/h5ex_t_int.h5 new file mode 100644 index 00000000..33f4cbba Binary files /dev/null and b/test/test_files/hdf5_test_files/h5ex_t_int.h5 differ diff --git a/test/test_files/jldstruct.jld b/test/test_files/jldstruct.jld new file mode 100644 index 00000000..3a17ad68 Binary files /dev/null and b/test/test_files/jldstruct.jld differ diff --git a/test/test_files/large_fractal_heap.h5 b/test/test_files/large_fractal_heap.h5 new file mode 100644 index 00000000..247c2b0c Binary files /dev/null and b/test/test_files/large_fractal_heap.h5 differ diff --git a/test/test_files/netcdf.nc b/test/test_files/netcdf.nc new file mode 100644 index 00000000..2d2d8b13 Binary files /dev/null and b/test/test_files/netcdf.nc differ diff --git a/test/test_files/nullterm_ascii.h5 b/test/test_files/nullterm_ascii.h5 new file mode 100644 index 00000000..7668afe2 Binary files /dev/null and b/test/test_files/nullterm_ascii.h5 differ diff --git a/test/test_files/simple.nc 
b/test/test_files/simple.nc new file mode 100644 index 00000000..a4cb2a85 Binary files /dev/null and b/test/test_files/simple.nc differ