Rewrite this one to Arlib API, and wipe out moremem

2026-04-17 15:25:59 -05:00 · 2016-12-20 22:19:58 +01:00 · 2016-12-20 22:19:58 +01:00 · 0e31df85fd
commit 0e31df85fd
parent 8d1780bb29
6 changed files with 101 additions and 111 deletions
--- a/4
+++ b/4
@ -4,4 +4,8 @@ ARWUTF = 1

 EXTRAOBJ += obj/divsufsort-c$(OBJSUFFIX).o
 SOURCES += patch/*.cpp
+
+DOMAINS += LDSS
+SOURCES_LDSS := libdivsufsort-2.0.1/lib/divsufsort.c libdivsufsort-2.0.1/lib/sssort.c libdivsufsort-2.0.1/lib/trsort.c
+CFLAGS_LDSS := -Ilibdivsufsort-2.0.1/include -DHAVE_CONFIG_H
 include arlib/Makefile
--- a/patch/bps-create.cppx
+++ b/patch/bps-create.cppx
@ -1,10 +1,4 @@
-#include "libbps.h"
-#include "arlib/crc32.h"
-#include "arlib/file.h"
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <stdio.h>
+#include "patch.h"

 //These two give minor performance penalties and will print some random stuff to stdout.
 //The former will verify the correctness of the output patch, the latter will print some performance data.
@ -19,10 +13,6 @@
 //If it's something else, get a non-broken array calculator.
 #define EOF_IS_LAST false

-#if defined(TEST_CORRECT) || defined(TEST_PERF)
-#include <stdio.h>
-#endif
-
 //Algorithm description:
 //
 //This is heavily built upon suffix sorting; the implementation I use, libdivsufsort, claims
@ -31,10 +21,14 @@
 //
 //The program starts by taking an equal amount of the source file and target file, concatenates that
 // with target first, and suffix sorts it.
-//It also calculates a reverse index, such that reverse[sorted[i]]==i.
 //
-//To find a match, it goes to reverse[outpos], and scans sorted[] up and down for the closest entry
-// that either starts before the current output position, or is somewhere in the source file.
+//To find a match, it finds the sortpos where sorted[sortpos]==outpos. This is a binary search; it's
+// called O(n) times, with O(log n) comparisons per iteration. Each comparison is potentially O(n),
+// but for each matched byte, another iteration is removed from the outer loop, so the comparisons
+// can be considered O(1) each; the sum is O(n log n).
+//
+//After it's found sortpos, it scans sorted[] up and down for the closest entry that either starts
+// before the current output position, or is somewhere in the source file.
 //As the source file comes last, the end-of-file marker (whose value is outside the range of a byte)
 // is guaranteed to not be in the way for a better match.
 //This is called O(n) times, and averages O(1) as at least 50% of sorted[] is in range. However, it
@ -42,8 +36,8 @@
 //
 //It then checks which of the two candidates are superior, by checking how far they match each
 // other, and then checking if the upper one has another correct byte.
-//This is potentially O(n), but for each matched byte, another iteration is removed from the outer
-// loop, so the sum of all calls is O(n).
+//This is potentially O(n), but like the binary search, long matches reduce the outer loop. The sum
+// is O(n).
 //
 //When the program approaches the end of the sorted area, it re-sorts twice as much as last time.
 // This gives O(log n) calls to the suffix sorter.
@ -53,15 +47,15 @@
 //
 //Many details were omitted from the above, but that's the basic setup.
 //
-//Thus, the program is O(max(n log n, n, n) = n log n) average and O(max(n log n, n^2, n) = n^2)
-// worst case.
+//Thus, the program is O(max(n log n, n log n, n, n) = n log n) average and O(max(n log n, n log n,
+// n^2, n) = n^2) worst case.
 //
 //I conclude that the task of finding, understanding and implementing a sub-O(n^2) algorithm for
 // delta patching is resolved.


 //Known cases where this function does not emit the optimal encoding:
-//If a match in the target file would extend further than target_search_size, it is often skipped.
+//If a match in the target file would extend further than target_search_size, it's cut off.
 // Penalty: O(log n), with extremely low constants (it'd require a >256B match to be exactly there).
 // Even for big files, the penalty is very likely to remain zero; even hitting double-digit bytes
 // would require a file designed exactly for that.
@ -70,20 +64,23 @@
 //However, due to better heuristics and others' performance optimizations, this one still beats its
 // competitors.

+//TODO: test multiple same-length matches,
+// but only for lengths <= 64
+

 //Possible optimizations:
-//divsufsort() takes approximately 2/3 of the total time. create_reverse_index() takes roughly a third of the remainder.
+//divsufsort() takes approximately 1/2 of the total time.
 //Each iteration takes four times as long as the previous one.
 //If each iteration takes 4 times as long as the previous one, then the last one takes 3/4 of the total time.
-//Since divsufsort+create_reverse_index doesn't depend on anything else, the last iteration can be split off to its own thread.
+//Since divsufsort doesn't depend on anything else, the last iteration can be split off to its own thread.
 //This would split it to
-//Search, non-final:   2/9 * 1/4 = 2/36
-//Search, final:       2/9 * 3/4 = 6/36
-//Sort+rev, non-final: 7/9 * 1/4 = 7/36
-//Sort+rev, final:     7/9 * 3/4 = 21/36
+//Search, non-final:   1/2 * 1/4 = 1/8
+//Search, final:       1/2 * 3/4 = 3/8
+//Sort, non-final:     1/2 * 1/4 = 1/8
+//Sort, final:         1/2 * 3/4 = 3/8
 //All non-final must be done sequentially. Both Sort Final and non-final must be done before Search Final can start.
 //This means the final time, if Sort Final is split off, is
-//max(7/36+2/36, 21/36) + 6/36 = 27/36 = 3/4
+//max(1/8+1/8, 3/8) + 3/8 = 6/8 = 3/4
 //of the original time.
 //Due to
 //- the considerable complexity costs (OpenMP doesn't seem able to represent the "insert a wait in
@ -94,13 +91,13 @@
 //    and that the small ones are not, as that'd starve the big one
 //I deem a possible 25% boost not worthwhile.

-
-//Both sorting algorithms claim O(1) memory use (in addition to the bytes and the output). In
-// addition to that, this algorithm uses (source.len*target.len)*(sizeof(uint8_t)+2*sizeof(off_t))
-// bytes of memory, plus the input and output files, plus the patch.
-//For most hardware, this is 9*(source.len+target.len), or 5*(source+target) for the slim one.
+//Both sorting algorithms claim O(1) memory use, in addition to the in/outputs. For most hardware,
+// this is 5*(source.len+target.len).
+//If the output is stored to disk, that's all this algorithm needs as well.


+namespace patch { namespace bps {
+//TODO: HEAVY cleanups needed here
 #include "sais.cpp"
 template<typename sais_index_type>
 static void sufsort(sais_index_type* SA, const uint8_t* T, sais_index_type n) {
@ -116,7 +113,7 @@ static void sufsort(sais_index_type* SA, const uint8_t* T, sais_index_type n) {
 //I'd prefer to let them allocate from an array I give it, but divsuf doesn't allow that, and there
 // are only half a dozen allocations per call anyways.

-//This ends up in libdivsufsort if available, otherwise lite.
+//This ends up in libdivsufsort if available, otherwise sais.cpp.
 #include "divsufsort.h"
 static void sufsort(int32_t* SA, uint8_t* T, int32_t n)
 {
@ -140,6 +137,18 @@ template<typename T> static T max(T a, T b) { return a<b ? b : a; }


 namespace {
+//class filecache {
+//	file& f;
+//	uint32_t crc32;
+//	int bytes_used;
+//	uint8_t bytes[65536];
+//	
+//	void append(const uint8_t * data, size_t len)
+//	{
+//		
+//	}
+//};
+
 struct bps_creator {
 	uint8_t* out;
 	size_t outlen;
@ -206,7 +215,7 @@ struct bps_creator {
 	
 	size_t numtargetread;
 	
-	bps_creator(file* source, file* target, struct mem metadata)
+	bps_creator(const file& source, const file& target, struct mem metadata)
 	{
 		outlen = 0;
 		outbuflen = 128;
@ -220,12 +229,10 @@ struct bps_creator {
 		numtargetread = 0;
 		
 		append((const uint8_t*)"BPS1", 4);
-		appendnum(source->len);
-		appendnum(target->len);
+		appendnum(source.size());
+		appendnum(target.size());
 		appendnum(metadata.len);
 		append(metadata.ptr, metadata.len);
-		
-		setProgress(NULL, NULL);
 	}
 	
 	
@ -342,23 +349,7 @@ struct bps_creator {
 	}
 	
 	
-	bool (*prog_func)(void* userdata, size_t done, size_t total);
-	void* prog_dat;
-	
-	static bool prog_func_null(void* userdata, size_t done, size_t total) { return true; }
-	
-	void setProgress(bool (*progress)(void* userdata, size_t done, size_t total), void* userdata)
-	{
-		if (!progress) progress = prog_func_null;
-		
-		prog_func=progress;
-		prog_dat=userdata;
-	}
-	
-	bool progress(size_t done, size_t total)
-	{
-		return prog_func(prog_dat, done, total);
-	}
+	function<bool(size_t done, size_t total)> progress;
 	
 	
 	void finish(const uint8_t* source, const uint8_t* target)
@ -369,9 +360,9 @@ struct bps_creator {
 			puts("ERROR: patch creates wrong ROM size"),abort();
 #endif
 		
-		appendnum32(crc32(source, sourcelen));
-		appendnum32(crc32(target, targetlen));
-		appendnum32(crc32(out, outlen));
+		appendnum32(crc32(arrayview<byte>(source, sourcelen)));
+		appendnum32(crc32(arrayview<byte>(target, targetlen)));
+		appendnum32(crc32(arrayview<byte>(out, outlen)));
 	}
 	
 	struct mem getpatch()
@ -545,10 +536,8 @@ static void create_buckets(const uint8_t* data, off_t* index, off_t len, off_t*
 }

 template<typename off_t>
-static off_t find_index(off_t pos, const uint8_t* data, off_t datalen, const off_t* index, const off_t* reverse, off_t* buckets)
+static off_t find_index(off_t pos, const uint8_t* data, off_t datalen, const off_t* index, const off_t* buckets)
 {
-	if (reverse) return reverse[pos];
-	
 	//if (datalen<2) return 0;
 	uint16_t bucket = read2(data+pos, datalen-pos);
 //printf("p=%i b=%i\n",pos,bucket);
@ -654,29 +643,29 @@ off_t lerp(off_t x, off_t y, float frac)
 }

 template<typename off_t>
-static bpserror bps_create_suf_core(file* source, file* target, bool moremem, struct bps_creator * out)
+static result create_suf_core(const file& source, const file& target, struct bps_creator * out)
 {
 #define error(which) do { err = which; goto error; } while(0)
-	bpserror err;
+	result err;
 	
-	size_t realsourcelen = source->len;
-	size_t realtargetlen = target->len;
+	size_t realsourcelen = source.size();
+	size_t realtargetlen = target.size();
 	
 	size_t overflowtest = realsourcelen + realtargetlen;
 	
 	//source+target length is bigger than size_t
-	if (overflowtest < realsourcelen) return bps_too_big;
+	if (overflowtest < realsourcelen) return e_too_big;
 	
 	//source+target doesn't fit in unsigned off_t
-	if ((size_t)(off_t)overflowtest != overflowtest) return bps_too_big;
+	if ((size_t)(off_t)overflowtest != overflowtest) return e_too_big;
 	
 	//source+target doesn't fit in signed off_t
-	if ((off_t)overflowtest < 0) return bps_too_big;
+	if ((off_t)overflowtest < 0) return e_too_big;
 	
 	//the mallocs would overflow
-	if (realsourcelen+realtargetlen >= SIZE_MAX/sizeof(off_t)) return bps_too_big;
+	if (realsourcelen+realtargetlen >= SIZE_MAX/sizeof(off_t)) return e_too_big;
 	
-	if (realsourcelen+realtargetlen >= out->maxsize()) return bps_too_big;
+	if (realsourcelen+realtargetlen >= out->maxsize()) return e_too_big;
 	
 	
 	off_t sourcelen = realsourcelen;
@ -686,19 +675,14 @@ static bpserror bps_create_suf_core(file* source, file* target, bool moremem, st
 	
 	off_t* sorted = (off_t*)malloc(sizeof(off_t)*(realsourcelen+realtargetlen));
 	
-	off_t* sorted_inverse = NULL;
-	if (moremem) sorted_inverse = (off_t*)malloc(sizeof(off_t)*(realsourcelen+realtargetlen));
+	off_t* buckets = (off_t*)malloc(sizeof(off_t)*65537);
 	
-	off_t* buckets = NULL;
-	if (!sorted_inverse) buckets = (off_t*)malloc(sizeof(off_t)*65537);
-	
-	if (!sorted || !mem_joined || (!sorted_inverse && !buckets))
+	if (!sorted || !mem_joined || !buckets)
 	{
 		free(mem_joined);
 		free(sorted);
-		free(sorted_inverse);
 		free(buckets);
-		return bps_out_of_mem;
+		return e_out_of_mem;
 	}
 	
 	//sortedsize is how much of the target file is sorted
@ -720,35 +704,32 @@ static bpserror bps_create_suf_core(file* source, file* target, bool moremem, st
 		reindex:
 			
 			//this isn't an exact science
-			const float percSort = sorted_inverse ? 0.67 : 0.50;
-			const float percInv = sorted_inverse ? 0.11 : 0.10;
-			//const float percFind = sorted_inverse ? 0.22 : 0.40; // unused
+			const float percSort = 0.50;
+			const float percBuck = 0.10;
+			//const float percFind = 0.40; // unused
 			
 			const size_t progPreSort = lerp(prevsortedsize, sortedsize, 0);
-			const size_t progPreInv = lerp(prevsortedsize, sortedsize, percSort);
-			const size_t progPreFind = lerp(prevsortedsize, sortedsize, percSort+percInv);
+			const size_t progPreBuck = lerp(prevsortedsize, sortedsize, percSort);
+			const size_t progPreFind = lerp(prevsortedsize, sortedsize, percSort+percBuck);
 			
 			prevsortedsize = sortedsize;
 			
-			if (!out->progress(progPreSort, targetlen)) error(bps_canceled);
+			if (out->progress(progPreSort, targetlen)) error(e_canceled);
 			
-			if (target->read(mem_joined, 0, sortedsize) < (size_t)sortedsize) error(bps_io);
-			if (source->read(mem_joined+sortedsize, 0, sourcelen) < (size_t)sourcelen) error(bps_io);
+			if (target.read(arrayvieww<byte>(mem_joined, sortedsize), 0) < (size_t)sortedsize) error(e_io);
+			if (source.read(arrayvieww<byte>(mem_joined+sortedsize, sourcelen), 0) < (size_t)sourcelen) error(e_io);
 			out->move_target(mem_joined);
 			sufsort(sorted, mem_joined, sortedsize+sourcelen);
 			
-			if (!out->progress(progPreInv, targetlen)) error(bps_canceled);
+			if (out->progress(progPreBuck, targetlen)) error(e_canceled);
 			
-			if (sorted_inverse)
-				create_reverse_index(sorted, sorted_inverse, sortedsize+sourcelen);
-			else
-				create_buckets(mem_joined, sorted, sortedsize+sourcelen, buckets);
+			create_buckets(mem_joined, sorted, sortedsize+sourcelen, buckets);
 			
-			if (!out->progress(progPreFind, targetlen)) error(bps_canceled);
+			if (out->progress(progPreFind, targetlen)) error(e_canceled);
 		}
 		
 		off_t matchlen = 0;
-		off_t matchpos = adjust_match(find_index(outpos, mem_joined, sortedsize+sourcelen, sorted, sorted_inverse, buckets),
+		off_t matchpos = adjust_match(find_index(outpos, mem_joined, sortedsize+sourcelen, sorted, buckets),
 		                              mem_joined+outpos, sortedsize-outpos,
 		                              mem_joined,sortedsize+sourcelen, outpos,sortedsize,
 		                              sorted, sortedsize+sourcelen,
@ -771,11 +752,10 @@ static bpserror bps_create_suf_core(file* source, file* target, bool moremem, st
 	
 	out->finish(mem_joined+sortedsize, mem_joined);
 	
-	err = bps_ok;
+	err = e_ok;
 	
 error:
 	free(buckets);
-	free(sorted_inverse);
 	free(sorted);
 	free(mem_joined);
 	
@ -783,38 +763,41 @@ error:
 }


-template<typename T> static bpserror bps_create_suf_pick(file* source, file* target, bool moremem, struct bps_creator * bps);
-template<> bpserror bps_create_suf_pick<uint32_t>(file* source, file* target, bool moremem, struct bps_creator * bps)
+template<typename T> static result create_suf_pick(const file& source, const file& target, struct bps_creator * bps);
+template<> result create_suf_pick<uint32_t>(const file& source, const file& target, struct bps_creator * bps) //T is 32 bits wide: only the int32_t core is tried
 {
-	return bps_create_suf_core<int32_t>(source, target, moremem, bps);
+	return create_suf_core<int32_t>(source, target, bps); //the result, including e_too_big, is returned as-is
 }
-template<> bpserror bps_create_suf_pick<uint64_t>(file* source, file* target, bool moremem, struct bps_creator * bps)
+template<> result create_suf_pick<uint64_t>(const file& source, const file& target, struct bps_creator * bps) //T is 64 bits wide: the int32_t core is tried first, then int64_t
 {
-	bpserror err = bps_create_suf_core<int32_t>(source, target, moremem, bps);
-	if (err==bps_too_big) err = bps_create_suf_core<int64_t>(source, target, moremem, bps);
+	result err = create_suf_core<int32_t>(source, target, bps); //first attempt uses int32_t offsets
+	if (err==e_too_big) err = create_suf_core<int64_t>(source, target, bps); //retry with int64_t offsets only on e_too_big; any other result is kept
 	return err;
 }

 //This one picks a function based on 32-bit integers if that fits. This halves memory use for common inputs.
 //It also handles some stuff related to the BPS headers and footers.
-extern "C"
-bpserror bps_create_delta(file* source, file* target, struct mem metadata, struct mem * patchmem,
-                          bool (*progress)(void* userdata, size_t done, size_t total), void* userdata, bool moremem)
+result create(const file& source, const file& target, const file& metadata, file& patch,
+              function<bool(size_t done, size_t total)> progress)
 {
-	bps_creator bps(source, target, metadata);
-	bps.setProgress(progress, userdata);
+	mem metamem = metadata.mmap();
+	bps_creator bps(source, target, metamem);
+	metadata.unmap(metamem.v()); //the constructor has already appended the metadata to the patch buffer
+	bps.progress = progress;
 	
 	size_t maindata = bps.outlen;
 	
 	//off_t must be signed
-	bpserror err = bps_create_suf_pick<size_t>(source, target, moremem, &bps);
-	if (err!=bps_ok) return err;
+	result err = create_suf_pick<size_t>(source, target, &bps);
+	if (err!=e_ok) return err;
 	
-	*patchmem = bps.getpatch();
+	mem patchmem = bps.getpatch();
+	patch.write(patchmem.v());
 	
-	while ((patchmem->ptr[maindata]&0x80) == 0x00) maindata++;
-	if (maindata==patchmem->len-12-1) return bps_identical;
-	return bps_ok;
+	while ((patchmem.ptr[maindata]&0x80) == 0x00) maindata++; //reads the patch buffer, so it must still be allocated here
+	result ret = (maindata==patchmem.len-12-1) ? e_identical : e_ok; //decide the return value before the buffer is released
+	free(patchmem.ptr); //freed only after the last read of patchmem.ptr; freeing before the scan was a use-after-free
+	return ret;
 }


@ -876,3 +859,4 @@ printf("%i/%i=%f\n",match_len_tot,match_len_n,(float)match_len_tot/match_len_n);
 #endif
 }
 #endif
+}}
--- a/patch/bps.cpp
+++ b/patch/bps.cpp
@ -1,6 +1,7 @@
 #include "patch.h"

 namespace patch { namespace bps {
+//TODO: HEAVY cleanups needed here
 static uint32_t read32(uint8_t * ptr)
 {
 	uint32_t out;
--- a/patch/divsufsort.h
+++ b/patch/divsufsort.h
--- a/patch/patch.h
+++ b/patch/patch.h
@ -74,6 +74,7 @@ struct info {
 //Deprecated
 struct mem {
 	mem() : ptr(NULL), len(0) {}
+	mem(uint8_t* ptr, size_t len) : ptr(ptr), len(len) {}
 	mem(arrayview<byte> v) : ptr((byte*)v.ptr()), len(v.size()) {}
 	arrayvieww<byte> v() { return arrayvieww<byte>(ptr, len); }
 	uint8_t * ptr;
--- a/patch/sais.cpp
+++ b/patch/sais.cpp