// lz77.cc (forked from google/gipfeli)
// Copyright 2011 Google Inc. All Rights Reserved.

#include "lz77.h"

#include <algorithm>

#include "find_match_length.h"
#include "enum.h"
#include "stubs-internal.h"

namespace util {
namespace compression {
namespace gipfeli {
namespace {

// Any hash function will produce a valid compressed bitstream, but a good
// hash function reduces the number of collisions and thus yields better
// compression for compressible input, and more speed for incompressible
// input. Of course, it doesn't hurt if the hash function is reasonably fast
// either, as it gets called a lot.
inline uint32 HashBytes(uint32 bytes, int shift) {
  const uint32 kMul = 0x1e35a7bd;
  return (bytes * kMul) >> shift;
}

inline uint32 Hash(const char* p, int shift) {
  return HashBytes(UNALIGNED_LOAD32(p), shift);
}
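
// Worked example (added for clarity, not in the original source): with a
// hash table of 2^14 entries, shift = 32 - 14 = 18, so Hash() keeps the top
// 14 bits of the 32-bit product bytes * kMul, i.e. a bucket index in
// [0, 16383]. Multiplying by kMul first spreads similar 4-byte patterns
// across the table before the truncating shift.
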
// For 0 <= offset <= 4, GetUint32AtOffset(UNALIGNED_LOAD64(p), offset) will
// equal UNALIGNED_LOAD32(p + offset). Motivation: On x86-64 hardware we have
// empirically found that overlapping loads such as
// UNALIGNED_LOAD32(p) ... UNALIGNED_LOAD32(p+1) ... UNALIGNED_LOAD32(p+2)
// are slower than UNALIGNED_LOAD64(p) followed by shifts and casts to uint32.
static inline uint32 GetUint32AtOffset(uint64 v, int offset) {
  return v >> (LittleEndian::IsLittleEndian() ? 8 * offset : 32 - 8 * offset);
}
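
// Worked example (added for clarity, not in the original source): on a
// little-endian machine, v = UNALIGNED_LOAD64(p) holds p[0] in its lowest
// byte. For offset = 1, shifting right by 8 bits discards p[0], so the low
// 32 bits of the result are p[1..4], exactly UNALIGNED_LOAD32(p + 1).
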
}  // namespace

LZ77::LZ77(int input_size) {
  commands_ = new uint32[MaxCompressedCommandsSize(input_size)];
  content_ = new char[MaxCompressedContentSize(input_size)];
  table_bits_ = Bits::Log2Floor(input_size | 0x100);
  if (table_bits_ > kMaxHashTableBits) {
    table_bits_ = kMaxHashTableBits;
  }
  hash_table_ = new uint16[1 << table_bits_];
  memset(hash_table_, 0, (1 << table_bits_) * sizeof(*hash_table_));
}
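
// Sizing note (added for clarity, not in the original source): ORing
// input_size with 0x100 makes Log2Floor see at least 256, so the table
// never has fewer than 2^8 entries. For example, input_size = 100000 gives
// Log2Floor = 16, i.e. a 2^16-entry table, unless kMaxHashTableBits caps
// it lower.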

LZ77::~LZ77() {
  delete[] hash_table_;
  delete[] content_;
  delete[] commands_;
}

inline char* EmitLiteral(const char* literal,
                         int len,
                         const bool allow_fast_path,
                         char* op,
                         uint32* command) {
  *command = len;
  if (allow_fast_path && len <= 16) {
    UNALIGNED_STORE64(op, UNALIGNED_LOAD64(literal));
    UNALIGNED_STORE64(op + 8, UNALIGNED_LOAD64(literal + 8));
    return op + len;
  } else {
    char* ret = op + len;
    memcpy(op, literal, len);
    return ret;
  }
}
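
// Note (added for clarity, not in the original source): the fast path above
// unconditionally copies 16 bytes, so it may read up to 16 - len bytes past
// the end of the literal and scribble past op + len; later output simply
// overwrites the scratch bytes. Callers pass allow_fast_path = true only
// from the main loop, where the kInputMarginBytes slack keeps the extra
// read inside the input buffer.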

// REQUIRES: length at least 4 and offset at least 1.
inline void EmitCopy(const uint32 offset,
                     uint32 length,
                     uint32** commands) {
  // In 80% of cases length <= 7, and in 95% of cases length <= 15.
  // If the length is large, we split the copy into several smaller copies,
  // such that each length saved to commands is between 4 and 67, inclusive.
  // Later, exactly 6 bits are used to store the value minus 4.
  uint32 copy_command = COPY | offset;
  while (length > 67) {
    length -= 64;
    **commands = copy_command | (64 << 24);
    ++(*commands);
  }
  **commands = copy_command | (length << 24);
  ++(*commands);
}
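
// Worked example (added for clarity, not in the original source):
// EmitCopy(offset, 150, &commands) emits three commands with lengths
// 64, 64 and 22 (64 + 64 + 22 = 150). Splitting while length > 67 rather
// than length > 64 guarantees the final piece is still at least 4, so every
// stored length stays in the [4, 67] range that fits 6 bits (value - 4).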

void EmitCopyForTesting(const uint32 offset,
                        uint32 length,
                        uint32* commands,
                        uint32* commands_size) {
  uint32* pt = commands + *commands_size;
  EmitCopy(offset, length, &pt);
  *commands_size = pt - commands;
}

// Compresses "input" string to the "content" and "commands" buffers.
void LZ77::CompressFragment(const char* input,
                            const size_t input_size,
                            const char* prev_block,
                            char** content,
                            uint32* content_size,
                            uint32** commands,
                            uint32* commands_size) {
  uint32* commands_current = commands_;
  uint16* hash_table = hash_table_;
  // "ip" is the input pointer, and "op" is the output pointer.
  char* op = content_;
  const char* ip = input;
  const char* ip_end = input + input_size;
  const char* base_ip = ip;
  // Bytes in [next_emit, ip) will be emitted as literal bytes. Or
  // [next_emit, ip_end) after the main loop.
  const char* next_emit = ip;
  const int shift = 32 - table_bits_;
  const size_t kInputMarginBytes = 15;
  if (PREDICT_TRUE(input_size >= kInputMarginBytes)) {
    const char* ip_limit = input + input_size - kInputMarginBytes;
    for (uint32 next_hash = Hash(++ip, shift); ; ) {
      // The body of this loop calls EmitLiteral once and then EmitCopy one or
      // more times. (The exception is that when we're close to exhausting
      // the input we goto emit_remainder.)
      //
      // In the first iteration of this loop we're just starting, so
      // there's nothing to copy, so calling EmitLiteral once is
      // necessary. And we only start a new iteration when the
      // current iteration has determined that a call to EmitLiteral will
      // precede the next call to EmitCopy (if any).
      //
      // Step 1: Scan forward in the input looking for a 4-byte-long match.
      // If we get close to exhausting the input then goto emit_remainder.
      //
      // Heuristic match skipping: If 32 bytes are scanned with no matches
      // found, start looking only at every other byte. If 32 more bytes are
      // scanned, look at every third byte, etc. When a match is found,
      // immediately go back to looking at every byte. This is a small loss
      // (~5% performance, ~0.1% density) for compressible data due to more
      // bookkeeping, but for non-compressible data (such as JPEG) it's a huge
      // win since the compressor quickly "realizes" the data is incompressible
      // and doesn't bother looking for matches everywhere.
      //
      // The "skip" variable keeps track of how many bytes there are since the
      // last match; dividing it by 32 (i.e. right-shifting by five) gives the
      // number of bytes to move ahead for each iteration.
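      //
      // Worked example (added for clarity, not in the original source):
      // skip starts at 32 and bytes_between_hash_lookups = skip++ >> 5, so
      // the first 32 probes advance 1 byte each, the next 32 advance 2
      // bytes each, the next 32 advance 3, and so on.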
      uint32 skip = 32;
      const char* next_ip = ip;
      const char* candidate = NULL;
      bool in_prev_block = false;
      do {
        ip = next_ip;
        const uint32 hash = next_hash;
        const int bytes_between_hash_lookups = skip++ >> 5;
        next_ip = ip + bytes_between_hash_lookups;
        if (PREDICT_FALSE(next_ip > ip_limit)) {
          goto emit_remainder;
        }
        next_hash = Hash(next_ip, shift);
        size_t offset = hash_table[hash];
        candidate = base_ip + offset;
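        // Note (added for clarity, not in the original source): hash_table
        // stores offsets relative to base_ip and is not cleared between
        // blocks, so an entry pointing at or past ip must be stale; it is
        // reinterpreted as the same offset inside prev_block.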
        if (candidate >= ip) {
          candidate = prev_block + offset;
          in_prev_block = true;
        } else {
          in_prev_block = false;
        }
        hash_table[hash] = ip - base_ip;
      } while (UNALIGNED_LOAD32(ip) != UNALIGNED_LOAD32(candidate));
      // Step 2: A 4-byte match has been found. We'll later see if more
      // than 4 bytes match. But, prior to the match, input
      // bytes [next_emit, ip) are unmatched. Emit them as "literal bytes."
      op = EmitLiteral(next_emit, ip - next_emit, true, op, commands_current);
      commands_current++;
      // Step 3: Call EmitCopy, and then see if another EmitCopy could
      // be our next move. Repeat until we find no match for the
      // input immediately after what was consumed by the last EmitCopy call.
      //
      // If we exit this loop normally then we need to call EmitLiteral next,
      // though we don't yet know how big the literal will be. We handle that
      // by proceeding to the next iteration of the main loop. We also can exit
      // this loop via goto if we get close to exhausting the input.
      uint64 input_bytes = 0;
      do {
        // We have a 4-byte match at ip, and no need to emit any
        // "literal bytes" prior to ip.
        const char* base = ip;
        int matched = 4;
        if (!in_prev_block || ((prev_block + kBlockSize) == base_ip) ||
            (candidate + 4 >= (prev_block + kBlockSize))) {
          matched += util::compression::FindMatchLength(
              reinterpret_cast<const uint8*>(candidate) + 4,
              reinterpret_cast<const uint8*>(ip) + 4,
              reinterpret_cast<const uint8*>(ip_end));
        } else {
          matched += MultiBlockFindMatchLength(
              reinterpret_cast<const uint8*>(candidate) + 4,
              reinterpret_cast<const uint8*>(prev_block + kBlockSize),
              reinterpret_cast<const uint8*>(base_ip),
              reinterpret_cast<const uint8*>(ip) + 4,
              reinterpret_cast<const uint8*>(ip_end));
        }
        ip += matched;
        int offset;
        if (in_prev_block) {
          offset = base - base_ip + prev_block + kBlockSize - candidate;
        } else {
          offset = base - candidate;
        }
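        // Note (added for clarity, not in the original source): for a match
        // in prev_block, the distance back from base is (base - base_ip)
        // bytes into the current block plus (prev_block + kBlockSize -
        // candidate) bytes from the candidate to the end of the previous
        // block.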
        EmitCopy(offset, matched, &commands_current);
        // We could immediately start working at ip now, but to improve
        // compression we first update hash_table[Hash(ip - 1, ...)].
        const char* insert_tail = ip - 3;
        next_emit = ip;
        if (PREDICT_FALSE(ip >= ip_limit)) {
          goto emit_remainder;
        }
        input_bytes = UNALIGNED_LOAD64(insert_tail);
        int n = insert_tail - base_ip;
        uint32 prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 0), shift);
        hash_table[prev_hash] = n++;
        prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 1), shift);
        hash_table[prev_hash] = n++;
        prev_hash = HashBytes(GetUint32AtOffset(input_bytes, 2), shift);
        hash_table[prev_hash] = n++;
        uint32 cur_hash = HashBytes(GetUint32AtOffset(input_bytes, 3), shift);
        int candidate_offset = hash_table[cur_hash];
        candidate = base_ip + candidate_offset;
        if (candidate >= ip) {
          candidate = prev_block + candidate_offset;
          in_prev_block = true;
        } else {
          in_prev_block = false;
        }
        hash_table[cur_hash] = n;
      } while (GetUint32AtOffset(input_bytes, 3) ==
               UNALIGNED_LOAD32(candidate));
      next_hash = HashBytes(GetUint32AtOffset(input_bytes, 4), shift);
      ++ip;
    }
  }

emit_remainder:
  // Emit the remaining bytes as a literal.
  if (next_emit < ip_end) {
    op = EmitLiteral(next_emit, ip_end - next_emit, false, op,
                     commands_current);
    ++commands_current;
  }
  *content = content_;
  *content_size = op - content_;
  *commands = commands_;
  *commands_size = commands_current - commands_;
}
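
// Usage sketch (added for illustration; assumes the caller drives the
// block-by-block loop used elsewhere in this library):
//   LZ77 lz77(input_size);
//   char* content = NULL;
//   uint32 content_size = 0;
//   uint32* commands = NULL;
//   uint32 commands_size = 0;
//   lz77.CompressFragment(block, block_size, prev_block,
//                         &content, &content_size,
//                         &commands, &commands_size);
// The returned pointers alias buffers owned by the LZ77 object, so they are
// overwritten by the next call.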

void LZ77::ResetHashTable(int input_size) {
  table_bits_ = Bits::Log2Floor(input_size | 0x100);
  if (table_bits_ > kMaxHashTableBits) {
    table_bits_ = kMaxHashTableBits;
  }
  memset(hash_table_, 0, (1 << table_bits_) * sizeof(*hash_table_));
}

// The value of content is always a subsequence of the input.
size_t LZ77::MaxCompressedContentSize(size_t input_size) {
  return input_size;
}

// The worst case is when these two commands alternate throughout the input:
//  - emit a literal of length 1 (1 command), and
//  - emit a copy of length 4 (1 command).
// So we need at most 2 commands for every 5 bytes of input.
// The exception, where more commands than 2 * input_size / 5 are needed, is
// an input of size exactly 1, which needs 1 command.
//
// One extra command is reserved because CompressCommands pre-fetches the
// next command before the comparison on array size.
size_t LZ77::MaxCompressedCommandsSize(size_t input_size) {
  return 2 * input_size / 5 + 2;
}
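
// Worked example (added for clarity, not in the original source): for
// input_size = 100, the worst-case (literal of 1, copy of 4) pattern costs
// 2 commands per 5 bytes, i.e. 40 commands; the bound returns
// 2 * 100 / 5 + 2 = 42, covering the size-1 corner case and the command
// pre-fetched by CompressCommands.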

}  // namespace gipfeli
}  // namespace compression
}  // namespace util