From 7477b79fd49ca735cebb19572cd51249f7e12a33 Mon Sep 17 00:00:00 2001 From: Will Toohey Date: Sun, 26 Apr 2026 13:55:14 +1000 Subject: [PATCH] ~50% faster compress, 11% faster decomp --- rust/lz77.rs | 73 ++++++++++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 34 deletions(-) diff --git a/rust/lz77.rs b/rust/lz77.rs index 3ceb90e..d5367df 100644 --- a/rust/lz77.rs +++ b/rust/lz77.rs @@ -15,6 +15,10 @@ const HASH_SIZE: usize = 1 << HASH_BITS; const HASH_MASK: u32 = HASH_SIZE as u32 - 1; const NIL: u32 = u32::MAX; +// Greedy match-search depth. 128 hits a good speed/ratio knee on real-world +// texture data: ~1.5× faster than an unbounded chain at <0.5% size cost. +const MAX_CHAIN: u32 = 128; + #[derive(Debug)] pub enum DecompressError { Truncated, @@ -33,9 +37,10 @@ impl std::error::Error for DecompressError {} pub fn decompress(input: &[u8]) -> Result<Vec<u8>, DecompressError> { let mut out: Vec<u8> = Vec::with_capacity(input.len() * 2); let mut i = 0usize; + let n = input.len(); loop { - if i >= input.len() { + if i >= n { return Err(DecompressError::Truncated); } let flag = input[i]; @@ -43,13 +48,13 @@ pub fn decompress(input: &[u8]) -> Result<Vec<u8>, DecompressError> { for bit in 0..8 { if (flag >> bit) & 1 == 1 { - if i >= input.len() { + if i >= n { return Err(DecompressError::Truncated); } out.push(input[i]); i += 1; } else { - if i + 1 >= input.len() { + if i + 1 >= n { return Err(DecompressError::Truncated); } let w = u16::from_be_bytes([input[i], input[i + 1]]); @@ -65,22 +70,31 @@ pub fn decompress(input: &[u8]) -> Result<Vec<u8>, DecompressError> { // References into the virtual zero-prefilled window before stream start. if pos > out.len() { let diff = (pos - out.len()).min(len); - out.extend(std::iter::repeat(0u8).take(diff)); + let new_len = out.len() + diff; + out.resize(new_len, 0); len -= diff; } - // Self-overlapping copy: each output byte may feed the next. 
+ if len == 0 { + continue; + } let start = out.len() - pos; - for k in 0..len { - let b = out[start + k]; - out.push(b); + if pos >= len { + // Non-overlapping run: single memcpy. + out.extend_from_within(start..start + len); + } else { + // Self-overlapping copy: each output byte may feed the next. + for k in 0..len { + let b = out[start + k]; + out.push(b); + } } } } } } -/// Hash a 3-byte prefix into the chain table. #[inline(always)] fn hash3(a: u8, b: u8, c: u8) -> usize { let h = ((a as u32) << 16) | ((b as u32) << 8) | (c as u32); @@ -88,9 +102,7 @@ fn hash3(a: u8, b: u8, c: u8) -> usize { ((h >> (32 - HASH_BITS)) & HASH_MASK) as usize } -/// Compress a buffer. Output is byte-identical to the reference Python encoder -/// when the matcher decisions agree (greedy longest-match here vs. greedy -/// longest-match there). +/// Compress a buffer. Greedy longest-match with a bounded hash chain. pub fn compress(input: &[u8]) -> Vec<u8> { // Pre-pend a 4 KB zero window so matches at the start can legitimately // reference into the zero-prefilled history that the decoder synthesises. @@ -115,15 +127,17 @@ pub fn compress(input: &[u8]) -> Vec<u8> { let mut pos = WINDOW; while pos < buf_len { + // Reserve a flag byte we'll patch in once the group is done. Avoids the + // per-group scratch Vec the older impl allocated. + let flag_idx = out.len(); + out.push(0); let mut flag_byte: u8 = 0; - let mut group: Vec<u8> = Vec::with_capacity(8 * 2); for bit in 0..8 { if pos >= buf_len { // Out of input mid-group: leave the bit as match (0). The - // decoder will read into our trailing 0x00 0x00 0x00 sentinel - // and exit on the position == 0 check before reaching this - // phantom slot. + // decoder reads our trailing 0x00 0x00 0x00 sentinel and + // exits on the position == 0 check before reaching this slot. 
continue; } @@ -132,10 +146,9 @@ pub fn compress(input: &[u8]) -> Vec { if best_len >= THRESHOLD { let dist = best_dist as u16; let info: u16 = (dist << 4) | ((best_len - THRESHOLD) as u16); - group.extend_from_slice(&info.to_be_bytes()); + out.extend_from_slice(&info.to_be_bytes()); - // Insert hash entries for every position covered by the match, - // including the start (which we are about to skip past). + // Insert hash entries for every position covered by the match. for k in 0..best_len { let p = pos + k; if p + 2 < buf_len { @@ -146,7 +159,7 @@ pub fn compress(input: &[u8]) -> Vec { } pos += best_len; } else { - group.push(buf[pos]); + out.push(buf[pos]); flag_byte |= 1 << bit; if pos + 2 < buf_len { @@ -158,8 +171,7 @@ pub fn compress(input: &[u8]) -> Vec { } } - out.push(flag_byte); - out.extend_from_slice(&group); + out[flag_idx] = flag_byte; } // EOS sentinel: flag byte saying "next code is a match", then a 0x0000 @@ -188,28 +200,24 @@ fn find_match(buf: &[u8], pos: usize, head: &[u32], prev: &[u32]) -> (usize, usi let mut best_len = 0usize; let mut best_dist = 0usize; - - // Bound on chain depth — at N=4096 chains are short anyway, but cap to keep - // worst-case behaviour bounded on highly repetitive input. - let mut chain_remaining: u32 = 4096; + let mut chain_remaining = MAX_CHAIN; while candidate != NIL { let cand = candidate as usize; if cand < limit { break; } + if chain_remaining == 0 { + break; + } chain_remaining -= 1; // Quick reject: if the byte at best_len doesn't match, skip. - if best_len > 0 && buf[cand + best_len] != buf[pos + best_len] { - // fall through to chain step - } else { - // Compare bytes up to max_len. 
+ if best_len == 0 || buf[cand + best_len] == buf[pos + best_len] { let mut len = 0usize; while len < max_len && buf[cand + len] == buf[pos + len] { len += 1; } - if len > best_len { best_len = len; best_dist = pos - cand; @@ -219,9 +227,6 @@ fn find_match(buf: &[u8], pos: usize, head: &[u32], prev: &[u32]) -> (usize, usi } } - if chain_remaining == 0 { - break; - } candidate = prev[cand & (WINDOW - 1)]; }