Significantly improve speed of LZ77 decompression for NDS build

2026-03-21 17:24:39 -05:00 · 2024-03-13 17:36:09 +01:00 · 2024-03-13 17:36:09 +01:00 · 6730a50457
commit 6730a50457
parent 23510710dd
3 changed files with 180 additions and 1 deletions
--- a/include/base_include.h
+++ b/include/base_include.h
@ -40,6 +40,7 @@ ALWAYS_INLINE MAX_OPTIMIZE int __get_next_vcount_interrupt(void) {
 // NDS defines and all
 #include <nds.h>
 #include "useful_qualifiers.h"
+#include "decompress.h"
 #define SCANLINES 0x107
 #define ROM GBAROM
 ALWAYS_INLINE MAX_OPTIMIZE int __get_next_vcount_interrupt(void) {
@ -84,7 +85,7 @@ typedef struct {
 #define Div(x, y) swiDivide(x, y)
 #define DivMod(x, y) swiRemainder(x, y)
 #define Sqrt(x) swiSqrt(x)
-#define LZ77UnCompWram(x, y) swiDecompressLZSSWram(x, y)
+#define LZ77UnCompWram(x, y) swi_LZ77UnCompWrite8bit(x, y)
 #define CpuFastSet(x, y, z) swiFastCopy(x, y, z)
 #define VBlankIntrWait() swiWaitForVBlank()
 #define Halt() CP15_WaitForInterrupt()
--- a/include/decompress.h
+++ b/include/decompress.h
@ -0,0 +1 @@
+#ifndef _DCMPRSS_HEADER_
#define _DCMPRSS_HEADER_

extern int swi_LZ77UnCompWrite16bit(const void *source, void *destination);
extern int swi_LZ77UnCompWrite8bit(const void *source, void *destination);

#endif
--- a/source/decompress.s
+++ b/source/decompress.s
@ -0,0 +1,177 @@
+@-------------------------------------------------------------
+@ Original code by fleroviux, distributed under the MIT License:
+@ Link: https://github.com/Cult-of-GBA/BIOS
+@ Copyright 2020 - 2021 DenSinH fleroviux
+@ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+@ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+@-------------------------------------------------------------
+@ Speed optimizations by Lorenzooone - 2024
+@ 8bit version not included because the optimized 16bit version is faster.
+@ Works fine even with 1 as the window offset (fixes LZ77 VRAM bug).
+@-------------------------------------------------------------
+
+ .arm
+ .align
+ .global swi_LZ77UnCompWrite16bit
+ .global swi_LZ77UnCompWrite8bit
+ .type swi_LZ77UnCompWrite16bit,function
+ .type swi_LZ77UnCompWrite8bit,function
+#ifdef __GBA__
+.section .iwram, "ax", %progbits
+#endif
+
+swi_LZ77UnCompWrite16bit:
+swi_LZ77UnCompWrite8bit:
+    stmfd sp!, {r3 - r7}
+
+    @ Read header word:
+    @ bit0-3:  reserved
+    @ bit4-7:  compressed type (1 for LZ77)
+    @ bit8-31: size of compressed data
+    ldr r2, [r0], #4
+    lsrs r2, r2, #8
+    @ ignore zero-length decompression requests
+    beq .lz77_16bit_done
+
+.lz77_16bit_loop:
+    @ read encoder byte, shift to MSB for easier access.
+    ldrb r3, [r0], #1
+    orr r3, #0x01000000
+.lz77_16bit_encoder_loop:
+    tst r3, #0x80
+    bne .lz77_16bit_copy_window
+.lz77_16bit_copy_byte:
+    @ copy byte from current source to current destination
+    ldrb r4, [r0], #1
+    tst r1, #1
+    moveq r7, r4
+    orrne r7, r4, lsl #8
+    strneh r7, [r1]
+    add r1, #1
+
+    @ check if decompressed length has been reached.
+    subs r2, #1
+    beq .lz77_16bit_done
+
+    @ read next encoder or process next block
+    lsls r3, #1
+    bcc .lz77_16bit_encoder_loop
+    b .lz77_16bit_loop
+.lz77_16bit_copy_window:
+    @ read window tuple {displacement, size}
+    ldrb r4, [r0], #1
+    ldrb r5, [r0], #1
+
+    @ r5 = window displacement
+    orr r5, r5, r4, lsl #8
+    bic r5, #0xF000
+    add r5, #1
+
+    @ r4 = window size
+    lsr r4, #4
+    add r4, #3
+    cmp r4, r2
+    movgt r4, r2
+    subs r2, r4
+    subs r5, r1, r5
+    @ About 50% of the time, the two addresses will be aligned. Abuse this...
+    eor r6, r1, r5
+    tst r6, #1
+    bne .lz77_16bit_copy_window_not_aligned_check
+    tst r1, #1
+    beq .lz77_optimized_16bit_copy_window_loop
+    ldrb r6, [r5], #1
+    orr r7, r6, lsl #8
+    strh r7, [r1], #1
+    subs r4, #1
+.lz77_optimized_16bit_copy_window_loop:
+    ldrh r6, [r5], #2
+    strh r6, [r1], #2
+    subs r4, #2
+    cmp r4, #1
+    bgt .lz77_optimized_16bit_copy_window_loop
+    bne .aligned_window_end_check
+    @ copy byte from window to current destination
+    ldrb r7, [r5]
+    add r1, #1
+
+.aligned_window_end_check:
+    @ check if decompressed length has been reached
+    cmp r2, #0
+    beq .lz77_16bit_done
+
+    @ read next encoder or process next block
+    lsls r3, #1
+    bcc .lz77_16bit_encoder_loop
+    b .lz77_16bit_loop
+
+.lz77_16bit_copy_window_not_aligned_check:
+    sub r1, #1
+    cmp r1, r5
+    beq .set_previous_byte
+    tst r5, #1
+    beq .lz77_16bit_copy_window_not_aligned_loop
+    ldrb r7, [r5], #1
+    add r1, #1
+    subs r4, #1
+.lz77_16bit_copy_window_not_aligned_loop:
+    ldrh r6, [r5], #2
+    orr r7, r6, lsl #8
+    strh r7, [r1], #2
+    lsr r7, #16
+    subs r4, #2
+    cmp r4, #1
+    bgt .lz77_16bit_copy_window_not_aligned_loop
+    bne .window_end_check
+    @ copy byte from window to current destination
+    ldrb r6, [r5]
+    orr r7, r6, lsl #8
+    strh r7, [r1], #1
+
+.window_end_check:
+    add r1, #1
+    @ check if decompressed length has been reached
+    cmp r2, #0
+    beq .lz77_16bit_done
+
+    @ read next encoder or process next block
+    lsls r3, #1
+    bcc .lz77_16bit_encoder_loop
+    b .lz77_16bit_loop
+
+.set_previous_byte:
+    tst r1, #1
+    beq .got_byte_in_r7
+    ldrb r7, [r1], #1
+    movs r6, r7
+    orr r7, r6, lsl #8
+    b .set_loop
+.got_byte_in_r7:
+    movs r6, r7
+    orr r7, r6, lsl #8
+    strh r7, [r1], #2
+    subs r4, #1
+.set_loop:
+    strh r7, [r1], #2
+    subs r4, #2
+    cmp r4, #1
+    bgt .set_loop
+    bne .set_previous_byte_end_check
+    @ copy byte from window to current destination
+    lsr r7, #8
+    add r1, #1
+
+.set_previous_byte_end_check:
+    @ check if decompressed length has been reached
+    cmp r2, #0
+    beq .lz77_16bit_done
+
+    @ read next encoder or process next block
+    lsls r3, #1
+    bcc .lz77_16bit_encoder_loop
+    b .lz77_16bit_loop
+
+.lz77_16bit_done:
+    ldmfd sp!, {r3 - r7}
+    bx lr
				`@ -0,0 +1 @@`
				`#ifndef _DCMPRSS_HEADER_ #define _DCMPRSS_HEADER_ extern int swi_LZ77UnCompWrite16bit(const void source, void destination); extern int swi_LZ77UnCompWrite8bit(const void source, void destination); #endif`