Remove a few redundant code paths in the LZ77 C++ implementation and fix a bug that prevented us from finding the optimal compression.

This commit is contained in:
Jennifer Taylor 2020-12-20 03:38:34 +00:00
parent 6ab6dc89c4
commit c431e30d3b

View File

@ -30,12 +30,6 @@ extern "C"
bool eof = false;
while (inloc < inlen && !eof)
{
if (inloc >= inlen)
{
// We failed to decompress, we overran the input buffer.
return -2;
}
uint8_t flags = indata[inloc++];
for (unsigned int flagpos = 0; flagpos < 8; flagpos++)
{
@ -62,8 +56,9 @@ extern "C"
// We failed to decompress, we overran the input buffer.
return -2;
}
uint8_t hi = indata[inloc++];
uint8_t lo = indata[inloc++];
unsigned int hi = indata[inloc++];
unsigned int lo = indata[inloc++];
unsigned int copy_len = (lo & 0xF) + 3;
unsigned int copy_pos = (hi << 4) | (lo >> 4);
@ -78,22 +73,22 @@ extern "C"
// Copy backref a byte at a time. This is because a backref can stick
// out into as-of-yet uncopied data in order to reference what we're
// about to write.
for (unsigned int backref_copy_amt = 0; backref_copy_amt < copy_len; backref_copy_amt++)
if (outloc + copy_len > outlen)
{
if (outloc >= outlen)
{
// We overwrote our output buffer, we probably corrupted memory somewhere.
return -3;
}
// We overwrote our output buffer, we probably corrupted memory somewhere.
return -3;
}
if (((int)outloc - (int)copy_pos) < 0)
int backref_start_loc = (int)outloc - (int)copy_pos;
for (int backref_copy_pos = backref_start_loc; backref_copy_pos < backref_start_loc + (int)copy_len; backref_copy_pos++)
{
if (backref_copy_pos < 0)
{
outdata[outloc++] = 0;
}
else
{
outdata[outloc] = outdata[outloc - copy_pos];
outloc++;
outdata[outloc++] = outdata[backref_copy_pos];
}
}
}
@ -114,60 +109,78 @@ extern "C"
while (!eof)
{
if (inloc == inlen)
if (outloc >= outlen)
{
if (outloc > (outlen - 3))
{
// We overwrote our output buffer, we probably corrupted memory somewhere.
return -3;
}
// We hit the end of the compressible data and we don't have a flag byte to
// add on to. Add a new empty flag byte.
outdata[outloc++] = 0;
// Add a backref pointing at the current byte to signify end of file.
outdata[outloc++] = 0;
outdata[outloc++] = 0;
// Bail out of the loop, we're done!
eof = true;
// We overwrote our output buffer, we probably corrupted memory somewhere.
return -3;
}
else
// Add a spot for the flag byte, we'll fill this in later.
unsigned int flagsloc = outloc;
outdata[outloc++] = 0;
for (unsigned int flagpos = 0; flagpos < 8; flagpos++)
{
if (outloc >= outlen)
if (inloc == inlen)
{
// We overwrote our output buffer, we probably corrupted memory somewhere.
return -3;
}
// Add a spot for the flag byte.
unsigned int flagsloc = outloc;
outdata[outloc++] = 0;
for (unsigned int flagpos = 0; (flagpos < 8 && !eof); flagpos++)
{
if (inloc == inlen)
if (outloc > (outlen - 2))
{
if (outloc > (outlen - 3))
{
// We overwrote our output buffer, we probably corrupted memory somewhere.
return -3;
}
// We hit the end of compressible data and we are mid flag byte.
// Set the particular flag bit to a backref and point at the current
// byte to signify end of file.
outdata[flagsloc] |= (FLAG_BACKREF << flagpos);
// Add the backref itself.
outdata[outloc++] = 0;
outdata[outloc++] = 0;
// Bail out of the loop, we're done!
eof = true;
// We overwrote our output buffer, we probably corrupted memory somewhere.
return -3;
}
else if (inloc < 3 || inloc >= (inlen - 3))
// We hit the end of compressible data and we are mid flag byte.
// Set the particular flag bit to a backref and point at the current
// byte to signify end of file.
outdata[flagsloc] |= (FLAG_BACKREF << flagpos);
// Add the backref itself.
outdata[outloc++] = 0;
outdata[outloc++] = 0;
// Bail out of the loop, we're done!
eof = true;
break;
}
else if (inloc < 3 || inloc >= (inlen - 3))
{
if (outloc >= outlen)
{
// We overwrote our output buffer, we probably corrupted memory somewhere.
return -3;
}
// We either don't have enough data written to backref, or we
// don't have enough data in the stream that could be made into
// a backref. Set the particular flag bit to a copy and then
// output that byte to the compressed stream.
outdata[flagsloc] |= (FLAG_COPY << flagpos);
// Update our key to reflect this byte coming out as long as we aren't
// in the first two bytes (we wouldn't have a 3 byte prefix in the key yet).
key = ((key << 8) | indata[inloc]) & 0xFFFFFF;
if (inloc >= 2)
{
starts[key].push_back(inloc - 2);
}
// Output this byte specifically
outdata[outloc++] = indata[inloc++];
}
else
{
// Figure out the maximum backref amount we can reference.
unsigned int backref_amount = std::min(inlen - inloc, MAX_BACKREF);
unsigned int earliest_backref = std::max(0, (int)inloc - (RING_LEN - 1));
uint32_t search_key = (indata[inloc] << 16) | (indata[inloc + 1] << 8) | (indata[inloc + 2]);
// Prune anything that we can't backref.
starts[search_key].remove_if([earliest_backref](auto val)
{
return val < earliest_backref;
});
if (starts[search_key].size() == 0)
{
if (outloc >= outlen)
{
@ -175,112 +188,81 @@ extern "C"
return -3;
}
// We either don't have enough data written to backref, or we
// don't have enough data in the stream that could be made into
// a backref. Set the particular flag bit to a copy and then
// output that byte to the compressed stream.
// We couldn't find any previous data in range of a backref.
outdata[flagsloc] |= (FLAG_COPY << flagpos);
// Update our key to reflect this byte coming out.
key = ((key << 8) | indata[inloc]) & 0xFFFFFF;
if (inloc >= 2)
{
starts[key].push_back(inloc - 2);
}
starts[key].push_back(inloc - 2);
// Output this byte specifically
outdata[outloc++] = indata[inloc++];
}
else
{
// Figure out the maximum backref amount we can reference.
unsigned int backref_amount = std::min(inlen - inloc, MAX_BACKREF);
unsigned int earliest_backref = std::max(0, (int)inloc - (RING_LEN - 1));
uint32_t search_key = (indata[inloc] << 16) | (indata[inloc + 1] << 8) | (indata[inloc + 2]);
int best_backref = -1;
unsigned int best_length = 1;
// Prune anything that we can't backref.
starts[search_key].remove_if([earliest_backref](auto val)
for (auto possible_backref = starts[search_key].begin(); possible_backref != starts[search_key].end(); possible_backref++)
{
return val < earliest_backref;
});
if (starts[search_key].size() == 0)
{
if (outloc >= outlen)
// If the current best length isn't a match on this chunk, then we shouldn't even consider it
// since the other chunk is already a better match.
if (indata[(*possible_backref) + (best_length - 1)] != indata[inloc + (best_length - 1)])
{
// We overwrote our output buffer, we probably corrupted memory somewhere.
return -3;
continue;
}
// We couldn't find any previous data in range of a backref.
outdata[flagsloc] |= (FLAG_COPY << flagpos);
// We already know that the first three match so we don't need to check those;
unsigned int current_length;
for (current_length = 3; current_length < backref_amount; current_length++)
{
if (indata[(*possible_backref) + current_length] != indata[inloc + current_length])
{
// This value doesn't match, so the current length is the longest prefix
// for this possible backref.
break;
}
}
// Update our key to reflect this byte coming out.
// We found a better match
if (best_length < current_length)
{
best_length = current_length;
best_backref = (inloc - *possible_backref) & 0xFFF;
}
else if (best_length == backref_amount)
{
// We found an ideal length, no need to keep searching.
break;
}
}
if (best_backref <= 0)
{
// Double check, since we know we should have found a backref.
return -2;
}
if (outloc > (outlen - 2))
{
// We overwrote our output buffer, we probably corrupted memory somewhere.
return -3;
}
// We got a valid backref, so let's record it as well as the start positions
// for each of the bytes we compressed.
outdata[flagsloc] |= (FLAG_BACKREF << flagpos);
// Add the backref itself.
outdata[outloc++] = (best_backref >> 4) & 0xFF;
outdata[outloc++] = ((best_backref & 0xF) << 4) | ((best_length - 3) & 0xF);
// Record the keys for each byte;
for (unsigned int i = 0; i < best_length; i++)
{
key = ((key << 8) | indata[inloc]) & 0xFFFFFF;
starts[key].push_back(inloc - 2);
// Output this byte specifically
outdata[outloc++] = indata[inloc++];
}
else
{
int best_backref = -1;
unsigned int best_length = 0;
for (auto possible_backref = starts[search_key].begin(); possible_backref != starts[search_key].end(); possible_backref++)
{
// We already know that the first three match so we don't need to check those;
unsigned int current_length;
for (current_length = 3; current_length < backref_amount; current_length++)
{
if (indata[(*possible_backref) + current_length] != indata[inloc + current_length])
{
break;
}
}
// We found a non-match.
if (best_length < current_length)
{
best_length = current_length;
best_backref = (inloc - *possible_backref) & 0xFFF;
break;
}
if (best_length == backref_amount)
{
// We found an ideal length, no need to keep searching.
break;
}
}
if (best_backref <= 0)
{
// Double check, since we know we should have found a backref.
return -2;
}
if (outloc > (outlen - 2))
{
// We overwrote our output buffer, we probably corrupted memory somewhere.
return -3;
}
// We got a valid backref, so let's record it as well as the start positions
// for each of the bytes we compressed.
outdata[flagsloc] |= (FLAG_BACKREF << flagpos);
// Add the backref itself.
outdata[outloc++] = (best_backref >> 4) & 0xFF;
outdata[outloc++] = ((best_backref & 0xF) << 4) | ((best_length - 3) & 0xF);
// Record the keys for each byte;
for (unsigned int i = 0; i < best_length; i++)
{
key = ((key << 8) | indata[inloc]) & 0xFFFFFF;
starts[key].push_back(inloc - 2);
inloc++;
}
inloc++;
}
}
}