Poke_Transporter_GB/source/zx0_decompressor.cpp
Philippe Symons 26fd1e2dd3 Add compression for the text data, output stack usage .su files and rework script_array
Add a binary table format and convert the text entries into this format in text_helper/main.py. It then gets compressed with zx0.

The new text_data_table and streamed_data_table classes exist to read the various entries from this binary table. streamed_data_table specifically
exists to use a decompression buffer that is smaller than the actual binary table. But it requires a decompression buffer that is
still larger than ZX0_DEFAULT_WINDOW_SIZE (default: 2048 bytes) and will only be able to decompress in
chunks of (<decompression_buffer_size> - <ZX0_DEFAULT_WINDOW_SIZE>) bytes

Try to keep the binary text tables sufficiently small though: since zx0 doesn't actually support random access,
getting to the last entry is significantly more expensive than reading the first one. And unless you use streamed_data_table,
it also requires <uncompressed_size> bytes of stack space, therefore IWRAM to decompress them.

I also had to rework script_array because it can no longer reference the strings directly. Instead we now reference the DIA_* "enum" values.
We also no longer store an array of script_obj instances, because these were getting stored in IWRAM since they're non-const global variables
originally. Instead we now have const arrays of script_obj_params structs, which should end up in .rodata -> therefore EWRAM.

Right now, script_obj only supports the PTGB text table (originally the dialogue array). But if the need arises to support other tables as well,
I'd consider adding a separate enum to script_obj_params to indicate the specific table.

The compilation process will also output .su files in the build folder from now on. These files indicate the stack frame size for every function in
every compilation unit, so be sure to check them from time to time. Note that they will only show the stack consumption for that specific function.
So to get the worst case stack consumption, you need to manually add all the functions in a certain stack flow.
2025-05-21 12:21:06 +02:00

326 lines
9.1 KiB
C++

#include "zx0_decompressor.h"
#include <cstring>
#include <tonc.h>
// The following code is a custom implementation of the ZX0 decompression algorithm invented by Einar Saukas
// Original implementation can be found here: https://github.com/einar-saukas/ZX0
// The header provides a C facade to access the relevant methods, but the rest of Poke Transporter GB
// doesn't need to be aware of all the datatypes/classes defined here.
/**
* This class makes reading on a per-bit basis much easier.
*
* Bits are served most-significant-first out of a 32-bit cache that is
* refilled from the backing buffer one word at a time (with a byte swap,
* see read(), so the earliest byte supplies the highest-order bits).
*/
class BitReader
{
public:
// buffer: start of the bit stream. NOTE(review): read() performs whole
// 32-bit loads from this pointer, so it presumably must stay 4-byte
// aligned on hardware where unaligned word loads misbehave — confirm
// at the call sites.
BitReader(const uint8_t* buffer);
// Returns the next numBits bits (0..32) as the low bits of the result.
IWRAM_CODE uint32_t read(uint32_t numBits);
protected:
private:
const uint8_t* cur_buffer_; // next word to refill the cache from
uint32_t cur_dword_; // bit cache (byte-swapped word)
uint32_t bits_left_; // number of valid bits remaining in cur_dword_
};
// The kinds of operations a ZX0 stream encodes, plus NONE as the
// "no command pending yet" sentinel used before decompression starts.
enum class ZX0OperationType
{
NONE, // no command decoded yet (initial state after start())
LITERAL_BLOCK, // copy bytes straight out of the compressed bit stream
COPY_LAST_OFFSET, // repeat bytes from the most recently used offset
COPY_NEW_OFFSET // read a new offset, then copy bytes from there
};
/**
 * @brief State of the command currently being executed.
 *
 * length and byte_pos track how far the command has progressed, so that
 * ZX0Decompressor::read() can stop mid-command and resume later.
 * (Plain `struct` — the C-style `typedef struct` is redundant in C++.)
 */
struct ZX0Command
{
    ZX0OperationType cmd_type; // which kind of operation this is
    uint32_t length;           // total number of bytes this command produces
    uint32_t offset;           // back-reference distance (copy commands only)
    uint32_t byte_pos;         // bytes of `length` already emitted
};
/**
* @brief This class implements the actual ZX0 decompression.
*
* Decompression is pull-based: start() arms the instance, after which
* read() produces any number of bytes at a time, pausing and resuming
* mid-command as needed.
*/
class ZX0Decompressor
{
public:
ZX0Decompressor();
/**
* @brief This function prepares the ZX0Decompressor instance
* for decompressing the specified input_data
* into the specified output_buffer
* @param output_buffer destination the decompressed bytes are written to
* @param input_data 4-byte decompressed-size header followed by the ZX0 bit stream
*/
void start(uint8_t *output_buffer, const uint8_t *input_data);
/**
* @brief Retrieves the size of the data when it is fully decompressed
* This is read from the first 4 bytes of the input_data
* (returns 0 if start() has not been called yet)
*/
IWRAM_CODE uint32_t get_decompressed_size() const;
/**
* @brief This function decompresses the next <num_bytes> bytes into the output buffer
* Note: there is no end-of-stream check — the caller must not request
* more than get_decompressed_size() bytes in total.
*/
IWRAM_CODE void read(uint32_t num_bytes);
/**
* @brief This function swaps out the current output buffer for the given one
* The back-reference cursor is rebased so that pending and future copy
* commands keep working relative to the new buffer.
*/
IWRAM_CODE void swap_output_buffer(uint8_t *new_output_buffer);
protected:
private:
// Decodes the next command from the bit stream into cur_command_.
IWRAM_CODE void read_next_command();
// Emits up to num_bytes of the current command; returns bytes produced.
IWRAM_CODE uint32_t copy_block(uint32_t num_bytes);
BitReader reader_; // bit-level view of the compressed stream
ZX0Command cur_command_; // command currently being executed
const uint8_t *input_data_; // start of compressed data (size header)
uint8_t *back_pos_; // source cursor for back-reference copies
uint8_t *cur_out_; // write cursor in the output buffer
uint32_t last_offset_; // most recent offset (reused by COPY_LAST_OFFSET)
};
IWRAM_CODE static inline uint32_t read_elias_gamma(BitReader& reader)
{
    // Count the zero bits that precede the first set bit; that count is
    // the number of payload bits that follow.
    uint32_t leading_zeros = 0;
    while (reader.read(1) == 0)
    {
        ++leading_zeros;
    }
    // The stop bit doubles as the implicit most significant bit of the value.
    const uint32_t decoded = (1 << leading_zeros) | reader.read(leading_zeros);
    // The caller works zero-based, so shift the 1-based gamma value down.
    return decoded - 1;
}
IWRAM_CODE static inline void read_new_offset(BitReader& reader, uint32_t& offset)
{
    // An offset is encoded as a 7-bit low part, optionally extended by an
    // Elias-gamma-coded high part; a single flag bit announces the latter.
    const bool msb_follows = reader.read(1) != 0;
    const uint32_t low_bits = reader.read(7);
    uint32_t high_bits = 0;
    if (msb_follows)
    {
        high_bits = read_elias_gamma(reader);
    }
    // Offsets are stored zero-based in the stream; 0 is never a valid
    // back-reference distance, hence the + 1.
    offset = ((high_bits << 7) | low_bits) + 1;
}
BitReader::BitReader(const uint8_t* buffer)
: cur_buffer_(buffer)
, cur_dword_(0)
, bits_left_(0)
{
}
// Serves num_bits (0..32) from the cache, refilling it from the buffer
// when needed. Defined with plain `inline`, which is fine because every
// caller lives in this translation unit.
IWRAM_CODE inline uint32_t BitReader::read(uint32_t num_bits)
{
    uint32_t result;
    // Fast path: the cached word covers the whole request.
    // On entry bits_left_ is always <= 31 (it only becomes 32 transiently
    // inside the slow path below), so num_bits <= 31 here and none of
    // these shifts can hit the undefined shift-by-32 case; read(32)
    // always takes the slow path.
    if (num_bits <= bits_left_)
    {
        result = (cur_dword_ >> (bits_left_ - num_bits)) & ((1u << num_bits) - 1);
        bits_left_ -= num_bits;
        return result;
    }
    // Slow path: take what is left in the cache, then refill and combine.
    result = cur_dword_ & ((1u << bits_left_) - 1);
    num_bits -= bits_left_; // now 1 <= num_bits <= 32
    // Refill cache (32-bit aligned read)
    // but the GBA (or x86 processor on pc) would read the value as little endian.
    // and we need it as big endian. Therefore we do a byte swap
    cur_dword_ = __builtin_bswap32(*(const uint32_t*)cur_buffer_);
    cur_buffer_ += sizeof(uint32_t);
    bits_left_ = 32;
    // Combine remaining bits. The left shift is split in two so the
    // num_bits == 32 case (read(32) issued while the cache was empty) does
    // not perform an undefined shift by 32; the right shift count
    // (32 - num_bits) is at most 31 because num_bits >= 1 here.
    result = (result << (num_bits - 1) << 1) | (cur_dword_ >> (32 - num_bits));
    bits_left_ -= num_bits;
    return result;
}
ZX0Decompressor::ZX0Decompressor()
: reader_(nullptr)
, cur_command_({ZX0OperationType::NONE, 0, 0, 0})
, input_data_(nullptr)
, back_pos_(nullptr)
, cur_out_(nullptr)
, last_offset_(UINT32_MAX)
{
}
void ZX0Decompressor::start(uint8_t *output_buffer, const uint8_t *input_data)
{
    // The compressed stream begins with a 32-bit decompressed-size word;
    // the bit stream proper starts right after it.
    input_data_ = input_data;
    reader_ = BitReader(input_data + sizeof(uint32_t));
    cur_out_ = output_buffer;
    back_pos_ = nullptr;
    last_offset_ = UINT32_MAX;
    // Force read() to fetch a fresh command on its first call.
    cur_command_ = {ZX0OperationType::NONE, 0, 0, 0};
}
IWRAM_CODE uint32_t ZX0Decompressor::get_decompressed_size() const
{
    // The size lives in the first 32-bit word of the compressed data;
    // before start() there is no stream to inspect, so report 0.
    return input_data_ ? *((const uint32_t*)input_data_) : 0;
}
IWRAM_CODE void ZX0Decompressor::read(uint32_t num_bytes)
{
    // Keep executing commands until the request is fully satisfied.
    for (uint32_t remaining = num_bytes; remaining != 0;)
    {
        // Fetch a fresh command once the current one is exhausted
        // (also true on the very first call after start()).
        if (cur_command_.byte_pos >= cur_command_.length)
        {
            read_next_command();
        }
        remaining -= copy_block(remaining);
    }
}
IWRAM_CODE void ZX0Decompressor::swap_output_buffer(uint8_t *new_output_buffer)
{
    // Keep the distance between the write cursor and the back-reference
    // cursor intact, so an in-flight copy command resumes correctly in
    // the new buffer.
    const uint32_t back_distance = cur_out_ - back_pos_;
    cur_out_ = new_output_buffer;
    back_pos_ = cur_out_ - back_distance;
}
// Decodes the next command header from the bit stream into cur_command_.
// The order of reader_ reads here is dictated by the stream format and
// must not be rearranged.
IWRAM_CODE inline void ZX0Decompressor::read_next_command()
{
const uint32_t cmd_bit = reader_.read(1);
// the "COPY_NEW_OFFSET" command adds + 1 to the length, but the other commands don't.
// given that read_elias_gamma() function is marked "inline", the way I set the length
// is to avoid having multiple calls to it here. (for code size)
if(cmd_bit)
{
// A set command bit always means "copy from a new offset", regardless
// of what the previous command was.
read_new_offset(reader_, last_offset_);
cur_command_.cmd_type = ZX0OperationType::COPY_NEW_OFFSET;
cur_command_.length = 1;
cur_command_.offset = last_offset_;
}
else if(cur_command_.cmd_type == ZX0OperationType::LITERAL_BLOCK)
{
// A clear bit following a literal block means "copy from the last offset".
cur_command_.cmd_type = ZX0OperationType::COPY_LAST_OFFSET;
// copy from new offset and last offset differs in the sense that with the new offset the encoded length is reduced by one
// and for last offset it isn't. This is likely because you still need to be able to insert a dummy "copy-from-last-offset" operation.
cur_command_.length = 0;
cur_command_.offset = last_offset_;
}
else
{
// A clear bit following anything else starts a literal block.
cur_command_.cmd_type = ZX0OperationType::LITERAL_BLOCK;
cur_command_.length = 0;
}
// The Elias-gamma-coded length is added on top of the base length
// chosen above (1 for COPY_NEW_OFFSET, 0 otherwise).
cur_command_.length += read_elias_gamma(reader_);
cur_command_.byte_pos = 0;
}
// Emits up to num_bytes of the current command into the output buffer,
// clamped to what is left of the command; returns the number of bytes
// actually produced. Safe to call repeatedly to finish a command in chunks.
IWRAM_CODE uint32_t ZX0Decompressor::copy_block(uint32_t num_bytes)
{
const uint32_t available = cur_command_.length - cur_command_.byte_pos;
const uint32_t bytes_to_read = (num_bytes > available) ? available : num_bytes;
uint32_t bytes_remaining = bytes_to_read;
if(cur_command_.cmd_type == ZX0OperationType::LITERAL_BLOCK)
{
// Literal copy
// Align cur_out_ first
while (bytes_remaining && ((uintptr_t)cur_out_ & 3))
{
(*cur_out_++) = reader_.read(8);
bytes_remaining--;
}
// Use bulk 32-bit writes when aligned
while (bytes_remaining >= 4)
{
// we need to swap again, because the data was originally stored in big endian format
// BitReader converted it to little endian format to make reading easier.
// and now we need to convert it back to big endian format.
*(uint32_t*)cur_out_ = __builtin_bswap32(reader_.read(32));
cur_out_ += 4;
bytes_remaining -= 4;
}
// Handle remaining bytes
while (bytes_remaining--)
{
(*cur_out_++) = reader_.read(8);
}
}
else
{
// Back-reference copy (COPY_NEW_OFFSET and COPY_LAST_OFFSET alike).
// back_pos_ is only (re)based at the start of a command, so a command
// interrupted by read() resumes exactly where it left off.
if(!cur_command_.byte_pos)
{
back_pos_ = cur_out_ - cur_command_.offset;
}
// try to get cur_out_ and back_pos aligned to 32 bit accesses first
while (bytes_remaining && (((uintptr_t)cur_out_ & 3) || ((uintptr_t)back_pos_ & 3)))
{
(*cur_out_++) = (*back_pos_++);
bytes_remaining--;
}
// now try bulk 32 bit writes
// Overlap safety: the two cursors always differ by `offset`, so they can
// only both be 4-byte aligned when offset is a multiple of 4 (hence >= 4).
// In that case each 32-bit read covers bytes that were already written.
// For any other offset the byte loops above/below do the whole copy.
while(bytes_remaining >= 4)
{
// these don't need to be byteswapped, because the data is being read with the same endianness as it is being written.
// this is different when reading from BitReader.
*(uint32_t*)cur_out_ = *((uint32_t*)back_pos_);
cur_out_ += 4;
back_pos_ += 4;
bytes_remaining -= 4;
}
while(bytes_remaining--)
{
(*cur_out_++) = (*back_pos_++);
}
}
// Record progress so read() can tell when the command is finished.
cur_command_.byte_pos += bytes_to_read;
return bytes_to_read;
}
// gets stored in .bss, and therefore will end up in IWRAM by default
static ZX0Decompressor decompressor;
// C facade: the rest of the project drives the single decompressor
// instance above through these functions (see the header, which exposes
// them to non-C++ aware callers).
extern "C"
{
// Arms the decompressor; input_data = 4-byte size header + ZX0 bit stream.
void zx0_decompressor_start(uint8_t *output_buffer, const uint8_t *input_data)
{
decompressor.start(output_buffer, input_data);
}
// Returns the total decompressed size (0 before the first start()).
uint32_t zx0_decompressor_get_decompressed_size()
{
return decompressor.get_decompressed_size();
}
// Decompresses the next num_bytes into the current output buffer.
void zx0_decompressor_read(uint32_t num_bytes)
{
decompressor.read(num_bytes);
}
// Streams the next num_bytes into a caller-provided buffer instead.
// Note the narrower uint16_t size parameter — chunked partial reads are
// expected to be small.
void zx0_decompressor_read_partial(uint8_t *output_buffer, uint16_t num_bytes)
{
decompressor.swap_output_buffer(output_buffer);
decompressor.read(num_bytes);
}
}