diff --git a/Makefile b/Makefile index c457bef..24bac82 100644 --- a/Makefile +++ b/Makefile @@ -170,6 +170,7 @@ BINFILES := $(foreach dir,../$(DATA),$(notdir $(wildcard $(dir)/*.*))) export OFILES_BIN := $(addsuffix .o,$(BINFILES)) OFILES += $(OFILES_BIN) +# Optimize zx0_decompressor for speed zx0_decompressor.o: CXXFLAGS += -O2 #--------------------------------------------------------------------------------- diff --git a/include/zx0_decompressor.h b/include/zx0_decompressor.h index bdf9d82..9409637 100644 --- a/include/zx0_decompressor.h +++ b/include/zx0_decompressor.h @@ -9,10 +9,6 @@ // Original implementation can be found here: https://github.com/einar-saukas/ZX0 // However, we've implemented a custom variant of this algorithm. // (for instance: we're storing the uncompressed size in the first 2 bytes in little endian) - -// Our implementation "streams" the decompression: the decompression buffer is only 2 KB, so it can't fit the entire -// uncompressed file at once. Therefore it uses a ringbuffer to stream the decompression on-demand. - extern "C" { /** @@ -31,7 +27,7 @@ extern "C" * @brief This function copies of decompressed data into the specified * It will trigger decompression on the go (streaming basis) */ - uint32_t zx0_decompressor_read(uint32_t num_bytes); + void zx0_decompressor_read(uint32_t num_bytes); } #endif \ No newline at end of file diff --git a/source/zx0_decompressor.cpp b/source/zx0_decompressor.cpp index 86c7c0f..7ec717a 100644 --- a/source/zx0_decompressor.cpp +++ b/source/zx0_decompressor.cpp @@ -1,13 +1,11 @@ #include "zx0_decompressor.h" #include +#include // The following code is a custom implementation of the ZX0 decompression algorithm invented by Einar Saukas // Original implementation can be found here: https://github.com/einar-saukas/ZX0 -// It uses classes, but keeps them completely hidden in the .cpp file with an anonymous namespace for internal linkage. // The header provides a C facade to access the relevant methods, but the rest of Poke Transporter GB // doesn't need to be aware of all the datatypes/classes defined here. -namespace -{ /** * This class makes reading on a per-bit basis much easier. */ @@ -16,14 +14,12 @@ class BitReader public: BitReader(const uint8_t* buffer); - uint32_t readBit(); - uint32_t read(uint32_t numBits); + IWRAM_CODE uint32_t read(uint32_t numBits); protected: private: - const uint8_t* buffer_; - const uint8_t* curBuffer_; - uint32_t currentDWord_; - uint32_t bitsLeft_; + const uint8_t* cur_buffer_; + uint32_t cur_dword_; + uint32_t bits_left_; }; enum class ZX0OperationType @@ -36,10 +32,10 @@ enum class ZX0OperationType typedef struct ZX0Command { - ZX0OperationType cmdType; + ZX0OperationType cmd_type; uint32_t length; uint32_t offset; - uint32_t bytePos; + uint32_t byte_pos; } ZX0Command; /** @@ -62,50 +58,43 @@ public: * @brief Retrieves the size of the data when it is fully decompressed * This is read from the first 2 bytes of the inputData */ - uint32_t getDecompressedSize() const; + IWRAM_CODE uint32_t get_decompressed_size() const; /** * @brief This function reads of data into */ - uint32_t read(uint32_t numBytes); + IWRAM_CODE void read(uint32_t num_bytes); protected: private: - void readNextCommand(); - uint32_t copy_block(uint32_t numBytes); + IWRAM_CODE void read_next_command(); + IWRAM_CODE uint32_t copy_block(uint32_t num_bytes); BitReader reader_; ZX0Command cur_command_; - const uint8_t *inputData_; + const uint8_t *input_data_; uint8_t *back_pos_; uint8_t *cur_out; - uint32_t bytesDecompressed_; - uint32_t lastOffset_; + uint32_t last_offset_; }; -static inline uint32_t read_elias_gamma(BitReader& reader) +IWRAM_CODE static inline uint32_t read_elias_gamma(BitReader& reader) { uint32_t num_non_leading_bits = 0; uint32_t value; - while (!reader.readBit()) + while (!reader.read(1)) { ++num_non_leading_bits; // Count leading zeros } // reconstruct the most significant bit of value - value = 1 << num_non_leading_bits; // Start with MSB - - // now apply the binary part (the actual value) - while(num_non_leading_bits--) - { - value |= (reader.readBit() << num_non_leading_bits); - } + value = (1 << num_non_leading_bits) | reader.read(num_non_leading_bits); // Start with MSB // Adjust back to zero-based return value - 1; } -static inline void read_new_offset(BitReader& reader, uint32_t& offset) +IWRAM_CODE static inline void read_new_offset(BitReader& reader, uint32_t& offset) { - const uint32_t has_msb = reader.readBit(); + const uint32_t has_msb = reader.read(1); const uint32_t lsb = reader.read(7); const uint32_t msb = (has_msb) ? read_elias_gamma(reader) : 0; @@ -114,45 +103,39 @@ static inline void read_new_offset(BitReader& reader, uint32_t& offset) } BitReader::BitReader(const uint8_t* buffer) - : buffer_(buffer) - , curBuffer_(buffer) - , currentDWord_(0) - , bitsLeft_(0) + : cur_buffer_(buffer) + , cur_dword_(0) + , bits_left_(0) { } -inline uint32_t BitReader::readBit() +IWRAM_CODE inline uint32_t BitReader::read(uint32_t num_bits) { - return read(1); -} - -inline uint32_t BitReader::read(uint32_t numBits) -{ - uint32_t result = 0; + uint32_t result; // Fast path: Read all bits from cached data - if (numBits <= bitsLeft_) + if (num_bits <= bits_left_) { - result = (currentDWord_ >> (bitsLeft_ - numBits)) & ((1 << numBits) - 1); - bitsLeft_ -= numBits; + result = (cur_dword_ >> (bits_left_ - num_bits)) & ((1 << num_bits) - 1); + bits_left_ -= num_bits; return result; } // Slow path: Refill cache and combine bits - result = currentDWord_ & ((1 << bitsLeft_) - 1); - numBits -= bitsLeft_; + result = cur_dword_ & ((1 << bits_left_) - 1); + num_bits -= bits_left_; // Refill cache (32-bit aligned read) // but the GBA (or x86 processor on pc) would read the value as little endian. // and we need it as big endian. Therefore we do a byte swap - currentDWord_ = __builtin_bswap32(*(uint32_t*)curBuffer_); + cur_dword_ = __builtin_bswap32(*(uint32_t*)cur_buffer_); - curBuffer_ += sizeof(uint32_t); - bitsLeft_ = 32; + cur_buffer_ += sizeof(uint32_t); + bits_left_ = 32; // Combine remaining bits - result = (result << numBits) | (currentDWord_ >> (32 - numBits)); - bitsLeft_ -= numBits; + result = (result << num_bits) | (cur_dword_ >> (32 - num_bits)); + bits_left_ -= num_bits; return result; } @@ -160,11 +143,10 @@ inline uint32_t BitReader::read(uint32_t numBits) ZX0Decompressor::ZX0Decompressor() : reader_(nullptr) , cur_command_({ZX0OperationType::NONE, 0, 0, 0}) - , inputData_(nullptr) + , input_data_(nullptr) , back_pos_(nullptr) , cur_out(nullptr) - , bytesDecompressed_(0) - , lastOffset_(UINT32_MAX) + , last_offset_(UINT32_MAX) { } @@ -172,144 +154,138 @@ void ZX0Decompressor::start(uint8_t *output_buffer, const uint8_t *input_data) { reader_ = BitReader(input_data + 4); cur_command_ = {ZX0OperationType::NONE, 0, 0, 0}; - inputData_ = input_data; + input_data_ = input_data; back_pos_ = nullptr; cur_out = output_buffer; - bytesDecompressed_ = 0; - lastOffset_ = UINT32_MAX; + last_offset_ = UINT32_MAX; } -uint32_t ZX0Decompressor::getDecompressedSize() const +IWRAM_CODE uint32_t ZX0Decompressor::get_decompressed_size() const { - if(!inputData_) + if(!input_data_) { return 0; } - return *((uint32_t*)inputData_); + return *((uint32_t*)input_data_); } -uint32_t ZX0Decompressor::read(uint32_t numBytes) +IWRAM_CODE void ZX0Decompressor::read(uint32_t num_bytes) { - const uint32_t decompressed_size = getDecompressedSize(); - const uint32_t bytesDecompressedBefore = bytesDecompressed_; - uint32_t bytesRead; - - while(numBytes && bytesDecompressed_ < decompressed_size) + while(num_bytes) { // Check if we have finished processing the previous pending command // if we have, we need to read a new operation - if(cur_command_.cmdType == ZX0OperationType::NONE || cur_command_.bytePos >= cur_command_.length) + if(cur_command_.byte_pos >= cur_command_.length) { - readNextCommand(); + read_next_command(); } - bytesRead = copy_block(numBytes); - numBytes -= bytesRead; - bytesDecompressed_ += bytesRead; + const uint32_t bytes_read = copy_block(num_bytes); + num_bytes -= bytes_read; } - - return bytesDecompressed_ - bytesDecompressedBefore; } -inline void ZX0Decompressor::readNextCommand() +IWRAM_CODE inline void ZX0Decompressor::read_next_command() { - const uint32_t cmdBit = reader_.readBit(); + const uint32_t cmd_bit = reader_.read(1); // the "COPY_NEW_OFFSET" command adds + 1 to the length, but the other commands don't. // given that read_elias_gamma() function is marked "inline", the way I set the length // is to avoid having multiple calls to it here. (for code size) - if(cmdBit) + if(cmd_bit) { - read_new_offset(reader_, lastOffset_); - cur_command_.cmdType = ZX0OperationType::COPY_NEW_OFFSET; + read_new_offset(reader_, last_offset_); + cur_command_.cmd_type = ZX0OperationType::COPY_NEW_OFFSET; cur_command_.length = 1; - cur_command_.offset = lastOffset_; - cur_command_.bytePos = 0; + cur_command_.offset = last_offset_; } - else if(cur_command_.cmdType == ZX0OperationType::LITERAL_BLOCK) + else if(cur_command_.cmd_type == ZX0OperationType::LITERAL_BLOCK) { - cur_command_.cmdType = ZX0OperationType::COPY_LAST_OFFSET; + cur_command_.cmd_type = ZX0OperationType::COPY_LAST_OFFSET; // copy from new offset and last offset differs in the sense that with the new offset the encoded length is reduced by one // and for last offset it isn't. This is likely because you still need to be able to insert a dummy "copy-from-last-offset" operation. cur_command_.length = 0; - cur_command_.offset = lastOffset_; + cur_command_.offset = last_offset_; } else { - cur_command_.cmdType = ZX0OperationType::LITERAL_BLOCK; + cur_command_.cmd_type = ZX0OperationType::LITERAL_BLOCK; cur_command_.length = 0; } cur_command_.length += read_elias_gamma(reader_); - cur_command_.bytePos = 0; + cur_command_.byte_pos = 0; } -uint32_t ZX0Decompressor::copy_block(uint32_t numBytes) +IWRAM_CODE uint32_t ZX0Decompressor::copy_block(uint32_t num_bytes) { - const uint32_t available = cur_command_.length - cur_command_.bytePos; - const uint32_t bytesToRead = (numBytes > available) ? available : numBytes; - uint32_t bytesRemaining = bytesToRead; + const uint32_t available = cur_command_.length - cur_command_.byte_pos; + const uint32_t bytes_to_read = (num_bytes > available) ? available : num_bytes; + uint32_t bytes_remaining = bytes_to_read; - if(cur_command_.cmdType == ZX0OperationType::LITERAL_BLOCK) + if(cur_command_.cmd_type == ZX0OperationType::LITERAL_BLOCK) { // Literal copy // Align cur_out first - while (bytesRemaining && ((uintptr_t)cur_out & 3)) + while (bytes_remaining && ((uintptr_t)cur_out & 3)) { (*cur_out++) = reader_.read(8); - bytesRemaining--; + bytes_remaining--; } // Use bulk 32-bit writes when aligned - while (bytesRemaining >= 4) + while (bytes_remaining >= 4) { + // we need to swap again, because the data was originally stored in big endian format + // BitReader converted it to little endian format to make reading easier. + // and now we need to convert it back to big endian format. *(uint32_t*)cur_out = __builtin_bswap32(reader_.read(32)); cur_out += 4; - bytesRemaining -= 4; + bytes_remaining -= 4; } // Handle remaining bytes - while (bytesRemaining--) + while (bytes_remaining--) { (*cur_out++) = reader_.read(8); } } else { - if(!cur_command_.bytePos) + if(!cur_command_.byte_pos) { back_pos_ = cur_out - cur_command_.offset; } // try to get cur_out and back_pos aligned to 32 bit accesses first - while (bytesRemaining && (((uintptr_t)cur_out & 3) || ((uintptr_t)back_pos_ & 3))) + while (bytes_remaining && (((uintptr_t)cur_out & 3) || ((uintptr_t)back_pos_ & 3))) { (*cur_out++) = (*back_pos_++); - bytesRemaining--; + bytes_remaining--; } // now try bulk 32 bit writes - while(bytesRemaining >= 4) + while(bytes_remaining >= 4) { + // these don't need to be byteswapped, because the data is being read with the same endianness as it is being written. + // this is different when reading from BitReader. *(uint32_t*)cur_out = *((uint32_t*)back_pos_); cur_out += 4; back_pos_ += 4; - bytesRemaining -= 4; + bytes_remaining -= 4; } - while(bytesRemaining--) + while(bytes_remaining--) { (*cur_out++) = (*back_pos_++); } } - cur_command_.bytePos += bytesToRead; + cur_command_.byte_pos += bytes_to_read; - return bytesToRead; + return bytes_to_read; } -} - -__attribute__((section(".iwram"))) +// gets stored in .bss, and therefore will end up in IWRAM by default static ZX0Decompressor decompressor; extern "C" @@ -321,11 +297,11 @@ void zx0_decompressor_start(uint8_t *output_buffer, const uint8_t *input_data) uint32_t zx0_decompressor_get_decompressed_size() { - return decompressor.getDecompressedSize(); + return decompressor.get_decompressed_size(); } -uint32_t zx0_decompressor_read(uint32_t num_bytes) +void zx0_decompressor_read(uint32_t num_bytes) { - return decompressor.read(num_bytes); + decompressor.read(num_bytes); } } \ No newline at end of file