More optimization: move code to IWRAM.

We can now decompress a 512-byte charset in about 1 ms (1.370 ms, to be exact).
This commit is contained in:
Philippe Symons 2025-04-28 12:34:19 +02:00
parent 75f8e5a26a
commit 652f781454
3 changed files with 86 additions and 113 deletions

View File

@ -170,6 +170,7 @@ BINFILES := $(foreach dir,../$(DATA),$(notdir $(wildcard $(dir)/*.*)))
export OFILES_BIN := $(addsuffix .o,$(BINFILES)) export OFILES_BIN := $(addsuffix .o,$(BINFILES))
OFILES += $(OFILES_BIN) OFILES += $(OFILES_BIN)
# Optimize zx0_decompressor for speed
zx0_decompressor.o: CXXFLAGS += -O2 zx0_decompressor.o: CXXFLAGS += -O2
#--------------------------------------------------------------------------------- #---------------------------------------------------------------------------------

View File

@ -9,10 +9,6 @@
// Original implementation can be found here: https://github.com/einar-saukas/ZX0 // Original implementation can be found here: https://github.com/einar-saukas/ZX0
// However, we've implemented a custom variant of this algorithm. // However, we've implemented a custom variant of this algorithm.
// (for instance: we're storing the uncompressed size in the first 2 bytes in little endian) // (for instance: we're storing the uncompressed size in the first 2 bytes in little endian)
// Our implementation "streams" the decompression: the decompression buffer is only 2 KB, so it can't fit the entire
// uncompressed file at once. Therefore it uses a ringbuffer to stream the decompression on-demand.
extern "C" extern "C"
{ {
/** /**
@ -31,7 +27,7 @@ extern "C"
* @brief This function copies <num_bytes> of decompressed data into the specified <output_buffer> * @brief This function copies <num_bytes> of decompressed data into the specified <output_buffer>
* It will trigger decompression on the go (streaming basis) * It will trigger decompression on the go (streaming basis)
*/ */
uint32_t zx0_decompressor_read(uint32_t num_bytes); void zx0_decompressor_read(uint32_t num_bytes);
} }
#endif #endif

View File

@ -1,13 +1,11 @@
#include "zx0_decompressor.h" #include "zx0_decompressor.h"
#include <cstring> #include <cstring>
#include <tonc.h>
// The following code is a custom implementation of the ZX0 decompression algorithm invented by Einar Saukas // The following code is a custom implementation of the ZX0 decompression algorithm invented by Einar Saukas
// Original implementation can be found here: https://github.com/einar-saukas/ZX0 // Original implementation can be found here: https://github.com/einar-saukas/ZX0
// It uses classes, but keeps them completely hidden in the .cpp file with an anonymous namespace for internal linkage.
// The header provides a C facade to access the relevant methods, but the rest of Poke Transporter GB // The header provides a C facade to access the relevant methods, but the rest of Poke Transporter GB
// doesn't need to be aware of all the datatypes/classes defined here. // doesn't need to be aware of all the datatypes/classes defined here.
namespace
{
/** /**
* This class makes reading on a per-bit basis much easier. * This class makes reading on a per-bit basis much easier.
*/ */
@ -16,14 +14,12 @@ class BitReader
public: public:
BitReader(const uint8_t* buffer); BitReader(const uint8_t* buffer);
uint32_t readBit(); IWRAM_CODE uint32_t read(uint32_t numBits);
uint32_t read(uint32_t numBits);
protected: protected:
private: private:
const uint8_t* buffer_; const uint8_t* cur_buffer_;
const uint8_t* curBuffer_; uint32_t cur_dword_;
uint32_t currentDWord_; uint32_t bits_left_;
uint32_t bitsLeft_;
}; };
enum class ZX0OperationType enum class ZX0OperationType
@ -36,10 +32,10 @@ enum class ZX0OperationType
typedef struct ZX0Command typedef struct ZX0Command
{ {
ZX0OperationType cmdType; ZX0OperationType cmd_type;
uint32_t length; uint32_t length;
uint32_t offset; uint32_t offset;
uint32_t bytePos; uint32_t byte_pos;
} ZX0Command; } ZX0Command;
/** /**
@ -62,50 +58,43 @@ public:
* @brief Retrieves the size of the data when it is fully decompressed * @brief Retrieves the size of the data when it is fully decompressed
* This is read from the first 2 bytes of the inputData * This is read from the first 2 bytes of the inputData
*/ */
uint32_t getDecompressedSize() const; IWRAM_CODE uint32_t get_decompressed_size() const;
/** /**
* @brief This function reads <numBytes> of data into <outputBuffer> * @brief This function reads <numBytes> of data into <outputBuffer>
*/ */
uint32_t read(uint32_t numBytes); IWRAM_CODE void read(uint32_t num_bytes);
protected: protected:
private: private:
void readNextCommand(); IWRAM_CODE void read_next_command();
uint32_t copy_block(uint32_t numBytes); IWRAM_CODE uint32_t copy_block(uint32_t num_bytes);
BitReader reader_; BitReader reader_;
ZX0Command cur_command_; ZX0Command cur_command_;
const uint8_t *inputData_; const uint8_t *input_data_;
uint8_t *back_pos_; uint8_t *back_pos_;
uint8_t *cur_out; uint8_t *cur_out;
uint32_t bytesDecompressed_; uint32_t last_offset_;
uint32_t lastOffset_;
}; };
static inline uint32_t read_elias_gamma(BitReader& reader) IWRAM_CODE static inline uint32_t read_elias_gamma(BitReader& reader)
{ {
uint32_t num_non_leading_bits = 0; uint32_t num_non_leading_bits = 0;
uint32_t value; uint32_t value;
while (!reader.readBit()) while (!reader.read(1))
{ {
++num_non_leading_bits; // Count leading zeros ++num_non_leading_bits; // Count leading zeros
} }
// reconstruct the most significant bit of value // reconstruct the most significant bit of value
value = 1 << num_non_leading_bits; // Start with MSB value = (1 << num_non_leading_bits) | reader.read(num_non_leading_bits); // Start with MSB
// now apply the binary part (the actual value)
while(num_non_leading_bits--)
{
value |= (reader.readBit() << num_non_leading_bits);
}
// Adjust back to zero-based // Adjust back to zero-based
return value - 1; return value - 1;
} }
static inline void read_new_offset(BitReader& reader, uint32_t& offset) IWRAM_CODE static inline void read_new_offset(BitReader& reader, uint32_t& offset)
{ {
const uint32_t has_msb = reader.readBit(); const uint32_t has_msb = reader.read(1);
const uint32_t lsb = reader.read(7); const uint32_t lsb = reader.read(7);
const uint32_t msb = (has_msb) ? read_elias_gamma(reader) : 0; const uint32_t msb = (has_msb) ? read_elias_gamma(reader) : 0;
@ -114,45 +103,39 @@ static inline void read_new_offset(BitReader& reader, uint32_t& offset)
} }
/**
 * @brief Creates a reader that starts consuming bits at <buffer>.
 * The bit cache starts out empty, so the first read() refills it from the buffer.
 */
BitReader::BitReader(const uint8_t* buffer)
    : cur_buffer_(buffer),
      cur_dword_(0),
      bits_left_(0)
{
}
inline uint32_t BitReader::readBit() IWRAM_CODE inline uint32_t BitReader::read(uint32_t num_bits)
{ {
return read(1); uint32_t result;
}
inline uint32_t BitReader::read(uint32_t numBits)
{
uint32_t result = 0;
// Fast path: Read all bits from cached data // Fast path: Read all bits from cached data
if (numBits <= bitsLeft_) if (num_bits <= bits_left_)
{ {
result = (currentDWord_ >> (bitsLeft_ - numBits)) & ((1 << numBits) - 1); result = (cur_dword_ >> (bits_left_ - num_bits)) & ((1 << num_bits) - 1);
bitsLeft_ -= numBits; bits_left_ -= num_bits;
return result; return result;
} }
// Slow path: Refill cache and combine bits // Slow path: Refill cache and combine bits
result = currentDWord_ & ((1 << bitsLeft_) - 1); result = cur_dword_ & ((1 << bits_left_) - 1);
numBits -= bitsLeft_; num_bits -= bits_left_;
// Refill cache (32-bit aligned read) // Refill cache (32-bit aligned read)
// but the GBA (or x86 processor on pc) would read the value as little endian. // but the GBA (or x86 processor on pc) would read the value as little endian.
// and we need it as big endian. Therefore we do a byte swap // and we need it as big endian. Therefore we do a byte swap
currentDWord_ = __builtin_bswap32(*(uint32_t*)curBuffer_); cur_dword_ = __builtin_bswap32(*(uint32_t*)cur_buffer_);
curBuffer_ += sizeof(uint32_t); cur_buffer_ += sizeof(uint32_t);
bitsLeft_ = 32; bits_left_ = 32;
// Combine remaining bits // Combine remaining bits
result = (result << numBits) | (currentDWord_ >> (32 - numBits)); result = (result << num_bits) | (cur_dword_ >> (32 - num_bits));
bitsLeft_ -= numBits; bits_left_ -= num_bits;
return result; return result;
} }
@ -160,11 +143,10 @@ inline uint32_t BitReader::read(uint32_t numBits)
ZX0Decompressor::ZX0Decompressor() ZX0Decompressor::ZX0Decompressor()
: reader_(nullptr) : reader_(nullptr)
, cur_command_({ZX0OperationType::NONE, 0, 0, 0}) , cur_command_({ZX0OperationType::NONE, 0, 0, 0})
, inputData_(nullptr) , input_data_(nullptr)
, back_pos_(nullptr) , back_pos_(nullptr)
, cur_out(nullptr) , cur_out(nullptr)
, bytesDecompressed_(0) , last_offset_(UINT32_MAX)
, lastOffset_(UINT32_MAX)
{ {
} }
@ -172,144 +154,138 @@ void ZX0Decompressor::start(uint8_t *output_buffer, const uint8_t *input_data)
{ {
reader_ = BitReader(input_data + 4); reader_ = BitReader(input_data + 4);
cur_command_ = {ZX0OperationType::NONE, 0, 0, 0}; cur_command_ = {ZX0OperationType::NONE, 0, 0, 0};
inputData_ = input_data; input_data_ = input_data;
back_pos_ = nullptr; back_pos_ = nullptr;
cur_out = output_buffer; cur_out = output_buffer;
bytesDecompressed_ = 0; last_offset_ = UINT32_MAX;
lastOffset_ = UINT32_MAX;
} }
uint32_t ZX0Decompressor::getDecompressedSize() const IWRAM_CODE uint32_t ZX0Decompressor::get_decompressed_size() const
{ {
if(!inputData_) if(!input_data_)
{ {
return 0; return 0;
} }
return *((uint32_t*)inputData_); return *((uint32_t*)input_data_);
} }
uint32_t ZX0Decompressor::read(uint32_t numBytes) IWRAM_CODE void ZX0Decompressor::read(uint32_t num_bytes)
{ {
const uint32_t decompressed_size = getDecompressedSize(); while(num_bytes)
const uint32_t bytesDecompressedBefore = bytesDecompressed_;
uint32_t bytesRead;
while(numBytes && bytesDecompressed_ < decompressed_size)
{ {
// Check if we have finished processing the previous pending command // Check if we have finished processing the previous pending command
// if we have, we need to read a new operation // if we have, we need to read a new operation
if(cur_command_.cmdType == ZX0OperationType::NONE || cur_command_.bytePos >= cur_command_.length) if(cur_command_.byte_pos >= cur_command_.length)
{ {
readNextCommand(); read_next_command();
} }
bytesRead = copy_block(numBytes); const uint32_t bytes_read = copy_block(num_bytes);
numBytes -= bytesRead; num_bytes -= bytes_read;
bytesDecompressed_ += bytesRead;
} }
return bytesDecompressed_ - bytesDecompressedBefore;
} }
inline void ZX0Decompressor::readNextCommand() IWRAM_CODE inline void ZX0Decompressor::read_next_command()
{ {
const uint32_t cmdBit = reader_.readBit(); const uint32_t cmd_bit = reader_.read(1);
// the "COPY_NEW_OFFSET" command adds + 1 to the length, but the other commands don't. // the "COPY_NEW_OFFSET" command adds + 1 to the length, but the other commands don't.
// given that read_elias_gamma() function is marked "inline", the way I set the length // given that read_elias_gamma() function is marked "inline", the way I set the length
// is to avoid having multiple calls to it here. (for code size) // is to avoid having multiple calls to it here. (for code size)
if(cmdBit) if(cmd_bit)
{ {
read_new_offset(reader_, lastOffset_); read_new_offset(reader_, last_offset_);
cur_command_.cmdType = ZX0OperationType::COPY_NEW_OFFSET; cur_command_.cmd_type = ZX0OperationType::COPY_NEW_OFFSET;
cur_command_.length = 1; cur_command_.length = 1;
cur_command_.offset = lastOffset_; cur_command_.offset = last_offset_;
cur_command_.bytePos = 0;
} }
else if(cur_command_.cmdType == ZX0OperationType::LITERAL_BLOCK) else if(cur_command_.cmd_type == ZX0OperationType::LITERAL_BLOCK)
{ {
cur_command_.cmdType = ZX0OperationType::COPY_LAST_OFFSET; cur_command_.cmd_type = ZX0OperationType::COPY_LAST_OFFSET;
// copy from new offset and last offset differs in the sense that with the new offset the encoded length is reduced by one // copy from new offset and last offset differs in the sense that with the new offset the encoded length is reduced by one
// and for last offset it isn't. This is likely because you still need to be able to insert a dummy "copy-from-last-offset" operation. // and for last offset it isn't. This is likely because you still need to be able to insert a dummy "copy-from-last-offset" operation.
cur_command_.length = 0; cur_command_.length = 0;
cur_command_.offset = lastOffset_; cur_command_.offset = last_offset_;
} }
else else
{ {
cur_command_.cmdType = ZX0OperationType::LITERAL_BLOCK; cur_command_.cmd_type = ZX0OperationType::LITERAL_BLOCK;
cur_command_.length = 0; cur_command_.length = 0;
} }
cur_command_.length += read_elias_gamma(reader_); cur_command_.length += read_elias_gamma(reader_);
cur_command_.bytePos = 0; cur_command_.byte_pos = 0;
} }
uint32_t ZX0Decompressor::copy_block(uint32_t numBytes) IWRAM_CODE uint32_t ZX0Decompressor::copy_block(uint32_t num_bytes)
{ {
const uint32_t available = cur_command_.length - cur_command_.bytePos; const uint32_t available = cur_command_.length - cur_command_.byte_pos;
const uint32_t bytesToRead = (numBytes > available) ? available : numBytes; const uint32_t bytes_to_read = (num_bytes > available) ? available : num_bytes;
uint32_t bytesRemaining = bytesToRead; uint32_t bytes_remaining = bytes_to_read;
if(cur_command_.cmdType == ZX0OperationType::LITERAL_BLOCK) if(cur_command_.cmd_type == ZX0OperationType::LITERAL_BLOCK)
{ {
// Literal copy // Literal copy
// Align cur_out first // Align cur_out first
while (bytesRemaining && ((uintptr_t)cur_out & 3)) while (bytes_remaining && ((uintptr_t)cur_out & 3))
{ {
(*cur_out++) = reader_.read(8); (*cur_out++) = reader_.read(8);
bytesRemaining--; bytes_remaining--;
} }
// Use bulk 32-bit writes when aligned // Use bulk 32-bit writes when aligned
while (bytesRemaining >= 4) while (bytes_remaining >= 4)
{ {
// we need to swap again, because the data was originally stored in big endian format
// BitReader converted it to little endian format to make reading easier.
// and now we need to convert it back to big endian format.
*(uint32_t*)cur_out = __builtin_bswap32(reader_.read(32)); *(uint32_t*)cur_out = __builtin_bswap32(reader_.read(32));
cur_out += 4; cur_out += 4;
bytesRemaining -= 4; bytes_remaining -= 4;
} }
// Handle remaining bytes // Handle remaining bytes
while (bytesRemaining--) while (bytes_remaining--)
{ {
(*cur_out++) = reader_.read(8); (*cur_out++) = reader_.read(8);
} }
} }
else else
{ {
if(!cur_command_.bytePos) if(!cur_command_.byte_pos)
{ {
back_pos_ = cur_out - cur_command_.offset; back_pos_ = cur_out - cur_command_.offset;
} }
// try to get cur_out and back_pos aligned to 32 bit accesses first // try to get cur_out and back_pos aligned to 32 bit accesses first
while (bytesRemaining && (((uintptr_t)cur_out & 3) || ((uintptr_t)back_pos_ & 3))) while (bytes_remaining && (((uintptr_t)cur_out & 3) || ((uintptr_t)back_pos_ & 3)))
{ {
(*cur_out++) = (*back_pos_++); (*cur_out++) = (*back_pos_++);
bytesRemaining--; bytes_remaining--;
} }
// now try bulk 32 bit writes // now try bulk 32 bit writes
while(bytesRemaining >= 4) while(bytes_remaining >= 4)
{ {
// these don't need to be byteswapped, because the data is being read with the same endianness as it is being written.
// this is different when reading from BitReader.
*(uint32_t*)cur_out = *((uint32_t*)back_pos_); *(uint32_t*)cur_out = *((uint32_t*)back_pos_);
cur_out += 4; cur_out += 4;
back_pos_ += 4; back_pos_ += 4;
bytesRemaining -= 4; bytes_remaining -= 4;
} }
while(bytesRemaining--) while(bytes_remaining--)
{ {
(*cur_out++) = (*back_pos_++); (*cur_out++) = (*back_pos_++);
} }
} }
cur_command_.bytePos += bytesToRead; cur_command_.byte_pos += bytes_to_read;
return bytesToRead; return bytes_to_read;
} }
// The single decompressor instance used by the C facade below.
// The section attribute places it explicitly in the ".iwram" section instead of the default .bss.
__attribute__((section(".iwram")))
static ZX0Decompressor decompressor;
extern "C" extern "C"
@ -321,11 +297,11 @@ void zx0_decompressor_start(uint8_t *output_buffer, const uint8_t *input_data)
uint32_t zx0_decompressor_get_decompressed_size() uint32_t zx0_decompressor_get_decompressed_size()
{ {
return decompressor.getDecompressedSize(); return decompressor.get_decompressed_size();
} }
uint32_t zx0_decompressor_read(uint32_t num_bytes) void zx0_decompressor_read(uint32_t num_bytes)
{ {
return decompressor.read(num_bytes); decompressor.read(num_bytes);
} }
} }