Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow compressing arrays larger than 4 GB #105

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions src/ZipFile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -516,13 +516,7 @@ function update_reader!(f::ReadableFile, data::Array{UInt8})
f._zpos = position(f._io) - f._datapos
datalen = length(data)
f._pos += datalen
chunk_size = if Sys.WORD_SIZE > 32 2^31 else datalen end
start = 1
while datalen > 0
f._currentcrc32 = Zlib.crc32(view(data, start:start-1+min(datalen, chunk_size)), f._currentcrc32)
datalen -= chunk_size
start += chunk_size
end
f._currentcrc32 = Zlib.crc32(data, f._currentcrc32)

if eof(f)
if f.method == Deflate
Expand Down Expand Up @@ -669,17 +663,16 @@ Base.readavailable(io::ZipFile.ReadableFile) = read(io)

"""
    unsafe_write(f::WritableFile, p::Ptr{UInt8}, nb::UInt)

Write the `nb` bytes located at `p` into `f` and return the number of bytes written.

The bytes are handed to `f._zio`. On success `f.crc32` is advanced over the same
bytes and `f.uncompressedsize` grows by the byte count. A zero-length request
returns `UInt(0)` without touching `f`.

# Throws
- `ErrorException("short write")` if `f._zio` reports fewer than `nb` bytes.
"""
function unsafe_write(f::WritableFile, p::Ptr{UInt8}, nb::UInt)
    # zlib doesn't like 0 length writes
    if nb == 0
        return UInt(0)
    end

    n = unsafe_write(f._zio, p, nb)
    if n != nb
        error("short write")
    end

    # Fold the checksum exactly once per write, straight from the pointer, so no
    # temporary Array wrapper is created around caller-owned memory.
    f.crc32 = Zlib.unsafe_crc32(p, nb, f.crc32)
    f.uncompressedsize += n
    return n
end
Expand Down
75 changes: 34 additions & 41 deletions src/Zlib.jl
Original file line number Diff line number Diff line change
Expand Up @@ -122,15 +122,20 @@ end

# Convenience constructor: forwards to the three-argument form with `9` as the
# middle argument (presumably the zlib compression level — confirm against the
# `Writer` definition). `raw` defaults to `false`.
function Writer(io::IO, raw::Bool=false)
    return Writer(io, 9, raw)
end

function write(w::Writer, p::Ptr, nb::Integer)
function Base.unsafe_write(w::Writer, p::Ptr{UInt8}, nb::UInt)::UInt
if nb == 0
return UInt(0)
end
max_chunk_size::UInt = UInt(typemax(Cuint))>>1
chunk_offset = UInt(0)
num_bytes_left = nb
chunk_size = min(max_chunk_size, num_bytes_left)
w.strm.avail_in = chunk_size
w.strm.next_in = p
w.strm.avail_in = nb
outbuf = Vector{UInt8}(undef, 1024)

GC.@preserve outbuf while true
w.strm.avail_out = length(outbuf)
w.strm.next_out = pointer(outbuf)

ret = ccall((:deflate, libz),
Int32, (Ptr{z_stream}, Int32),
Ref(w.strm), Z_NO_FLUSH)
Expand All @@ -139,10 +144,21 @@ function write(w::Writer, p::Ptr, nb::Integer)
end

n = length(outbuf) - w.strm.avail_out
if n > 0 && write(w.io, outbuf[1:n]) != n
if n > 0 && write(w.io, view(outbuf,1:n)) != n
error("short write")
end
if w.strm.avail_out != 0
# Update w.strm.avail_in if needed
if w.strm.avail_in == 0
# mark that previous chunk was written
chunk_offset += chunk_size
num_bytes_left -= chunk_size
# new chunk size, will be zero at the end.
chunk_size = min(max_chunk_size, num_bytes_left)
@assert chunk_offset + chunk_size ≤ nb
w.strm.next_in = p + chunk_offset
w.strm.avail_in = chunk_size
end
if (w.strm.avail_out != 0) && (w.strm.avail_in == 0)
break
end
end
Expand All @@ -151,38 +167,8 @@ function write(w::Writer, p::Ptr, nb::Integer)
nb
end

# Bulk-write a byte array: pass its data pointer and length to the pointer-based
# `write` method, keeping `a` rooted for the duration of the call.
function write(w::Writer, a::Array{UInt8})
    nbytes = length(a)
    GC.@preserve a begin
        return write(w, pointer(a), nbytes)
    end
end

# If this is not provided, Base.IO write methods will write
# arrays one element at a time.
#
# Arrays whose element type is a plain-bits type are written in a single call
# from their raw memory; every other element type falls back to the generic
# `write(::IO, ::Array)` method.
function write(w::Writer, a::Array{T}) where T
    # `isbitstype` is the type-level query. `isbits(T)` asks whether the type
    # *object* itself is plain bits, which is never true, so the bulk path
    # would be unreachable.
    if isbitstype(T)
        return GC.@preserve a write(w, pointer(a), length(a) * sizeof(T))
    end
    return invoke(write, Tuple{IO,Array}, w, a)
end

# Adapted from Julia base/io.jl
#
# Write a view of an `Array`. When the elements are plain bits and the first
# dimension is contiguous, each column is written as one block from memory;
# otherwise the generic `write(::IO, ::AbstractArray)` method is used.
# Returns the number of bytes written.
function write(w::Writer, a::SubArray{T,N,A}) where {T,N,A<:Array}
    # `stride` and `pointer` are only meaningful for strided views, so check
    # that before asking for the stride.
    if !isbitstype(T) || !(a isa StridedArray) || stride(a, 1) != 1
        return invoke(write, Tuple{IO,AbstractArray}, w, a)
    end
    colsz = size(a, 1) * sizeof(T)
    if N <= 1
        return GC.@preserve a write(w, pointer(a, 1), colsz)
    end
    # One contiguous block per column: visit the first element of every column.
    GC.@preserve a begin
        for colstart in CartesianIndices(tuple(1, size(a)[2:end]...))
            write(w, pointer(a, colstart), colsz)
        end
    end
    return sizeof(T) * length(a)
end

# Write a single byte to the compressed stream.
#
# The byte is written exactly once. Wrapping it in a `Ref` avoids allocating a
# one-element array per call.
function write(w::Writer, b::UInt8)
    return write(w, Ref(b))
end

function close(w::Writer)
Expand Down Expand Up @@ -365,10 +351,17 @@ function eof(r::Reader)
bytesavailable(r.buf) == 0 && eof(r.io)
end

"""
    unsafe_crc32(p::Ptr{UInt8}, nb::UInt, crc::UInt32) -> UInt32

Update the running CRC-32 value `crc` with the `nb` bytes starting at `p` and
return the new value.

This calls zlib's `crc32_z`, whose length argument is a `size_t`, so `nb` is not
limited to 32 bits. The caller is responsible for keeping the memory behind `p`
valid for the duration of the call.
"""
function unsafe_crc32(p::Ptr{UInt8}, nb::UInt, crc::UInt32)::UInt32
    return ccall((:crc32_z, libz),
                 Culong, (Culong, Ptr{UInt8}, Csize_t),
                 crc, p, nb)
end

"""
    crc32(data::AbstractArray{UInt8}, crc::Integer=0) -> UInt32

Return the CRC-32 of `data`, continuing from the running value `crc`
(`0` starts a fresh checksum).

`data` is read through `pointer(data)` for `length(data)` bytes.
NOTE(review): this assumes `data` is laid out contiguously in memory — confirm
for any non-dense array types passed by callers.
"""
function crc32(data::AbstractArray{UInt8}, crc::Integer=0)::UInt32
    GC.@preserve data begin
        return unsafe_crc32(pointer(data), UInt(length(data)), UInt32(crc))
    end
end

# CRC-32 of a string's code units, continuing from the running value `crc`.
# `codeunits` exposes the string's bytes as an `AbstractVector{UInt8}` without
# copying; there is no `convert` method from a string to `AbstractArray{UInt8}`.
crc32(data::AbstractString, crc::Integer=0) = crc32(codeunits(data), crc)
Expand Down
43 changes: 43 additions & 0 deletions test/bigtests.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# These tests require over 8 GB of memory and a 64 bit Int

using ZipFile
using Test

@testset "big array with Zlib" begin
    big_array = collect(1:2^29+2^25)

    # The array is verified in whole chunks of 2^22 elements.
    nchunks = length(big_array) >> 22
    # 1-based index range covered by chunk `bi`; because `big_array` is 1:n this
    # is also the expected content of that chunk.
    chunk_range(bi) = (((bi - 1) << 22) + 1):(bi << 22)

    # Round trip: compress the whole array in one `write`, then read it back
    # chunk by chunk and compare each chunk with the expected values.
    io = IOBuffer()
    w = ZipFile.Zlib.Writer(io, 1, true)
    write(w, big_array)
    close(w)
    w = nothing
    @info "done writing big_array"

    seekstart(io)
    r = ZipFile.Zlib.Reader(io, true)
    buffer = zeros(Int, 2^22)
    for bi in 1:nchunks
        read!(r, buffer)
        @test chunk_range(bi) == buffer
    end
    close(r)
    r = nothing
    close(io)
    io = nothing
    @info "done reading big_array"

    # Check that crc32 works: the checksum of the whole byte view must equal
    # the checksum accumulated chunk by chunk.
    crc32_big = ZipFile.Zlib.crc32(reinterpret(UInt8, big_array))
    crc32_parts = zero(UInt32)
    for bi in 1:nchunks
        part = view(big_array, chunk_range(bi))
        crc32_parts = ZipFile.Zlib.crc32(reinterpret(UInt8, part), crc32_parts)
    end
    @test crc32_parts == crc32_big

    # Drop the last reference so the large array can be collected.
    big_array = nothing
end
4 changes: 4 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,8 @@ if !Debug
rm(tmp, recursive=true)
end

# Opt-in only: the big-array tests are included when "bigtests" is passed in ARGS.
"bigtests" in ARGS && include("bigtests.jl")

println("done")