Rewrite this one to Arlib API, and wipe out moremem

This commit is contained in:
Alcaro 2016-12-20 22:19:58 +01:00
parent 8d1780bb29
commit 0e31df85fd
6 changed files with 101 additions and 111 deletions

View File

@ -4,4 +4,8 @@ ARWUTF = 1
EXTRAOBJ += obj/divsufsort-c$(OBJSUFFIX).o
SOURCES += patch/*.cpp
DOMAINS += LDSS
SOURCES_LDSS := libdivsufsort-2.0.1/lib/divsufsort.c libdivsufsort-2.0.1/lib/sssort.c libdivsufsort-2.0.1/lib/trsort.c
CFLAGS_LDSS := -Ilibdivsufsort-2.0.1/include -DHAVE_CONFIG_H
include arlib/Makefile

View File

@ -1,10 +1,4 @@
#include "libbps.h"
#include "arlib/crc32.h"
#include "arlib/file.h"
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include "patch.h"
//These two give minor performance penalties and will print some random stuff to stdout.
//The former will verify the correctness of the output patch, the latter will print some performance data.
@ -19,10 +13,6 @@
//If it's something else, get a non-broken array calculator.
#define EOF_IS_LAST false
#if defined(TEST_CORRECT) || defined(TEST_PERF)
#include <stdio.h>
#endif
//Algorithm description:
//
//This is heavily built upon suffix sorting; the implementation I use, libdivsufsort, claims
@ -31,10 +21,14 @@
//
//The program starts by taking an equal amount of the source file and target file, concatenates that
// with target first, and suffix sorts it.
//It also calculates a reverse index, such that reverse[sorted[i]]==i.
//
//To find a match, it goes to reverse[outpos], and scans sorted[] up and down for the closest entry
// that either starts before the current output position, or is somewhere in the source file.
//To find a match, it finds the sortpos where sorted[sortpos]==outpos. This is a binary search; it's
// called O(n) times, with O(log n) comparisons per iteration. Each comparison is potentially O(n),
// but for each matched byte, another iteration is removed from the outer loop, so the comparisons
// can be considered O(1) each; the sum is O(n log n).
//
//After it's found sortpos, it scans sorted[] up and down for the closest entry that either starts
// before the current output position, or is somewhere in the source file.
//As the source file comes last, the end-of-file marker (whose value is outside the range of a byte)
// is guaranteed to not be in the way for a better match.
//This is called O(n) times, and averages O(1) as at least 50% of sorted[] is in range. However, it
@ -42,8 +36,8 @@
//
//It then checks which of the two candidates is superior, by checking how far they match each
// other, and then checking if the upper one has another correct byte.
//This is potentially O(n), but for each matched byte, another iteration is removed from the outer
// loop, so the sum of all calls is O(n).
//This is potentially O(n), but like the binary search, long matches reduce the outer loop. The sum
// is O(n).
//
//When the program approaches the end of the sorted area, it re-sorts twice as much as last time.
// This gives O(log n) calls to the suffix sorter.
@ -53,15 +47,15 @@
//
//Many details were omitted from the above, but that's the basic setup.
//
//Thus, the program is O(max(n log n, n, n) = n log n) average and O(max(n log n, n^2, n) = n^2)
// worst case.
//Thus, the program is O(max(n log n, n log n, n, n)) = O(n log n) average and O(max(n log n,
// n log n, n^2, n)) = O(n^2) worst case.
//
//I conclude that the task of finding, understanding and implementing a sub-O(n^2) algorithm for
// delta patching is resolved.
//Known cases where this function does not emit the optimal encoding:
//If a match in the target file would extend further than target_search_size, it is often skipped.
//If a match in the target file would extend further than target_search_size, it's cut off.
// Penalty: O(log n), with extremely low constants (it'd require a >256B match to be exactly there).
// Even for big files, the penalty is very likely to remain zero; even hitting double-digit bytes
// would require a file designed exactly for that.
@ -70,20 +64,23 @@
//However, due to better heuristics and others' performance optimizations, this one still beats its
// competitors.
//TODO: test multiple same-length matches
// but only for lengths <= 64,
//Possible optimizations:
//divsufsort() takes approximately 2/3 of the total time. create_reverse_index() takes roughly a third of the remainder.
//divsufsort() takes approximately 1/2 of the total time.
//Each iteration takes four times as long as the previous one.
//If each iteration takes 4 times as long as the previous one, then the last one takes 3/4 of the total time.
//Since divsufsort+create_reverse_index doesn't depend on anything else, the last iteration can be split off to its own thread.
//Since divsufsort doesn't depend on anything else, the last iteration can be split off to its own thread.
//This would split it to
//Search, non-final: 2/9 * 1/4 = 2/36
//Search, final: 2/9 * 3/4 = 6/36
//Sort+rev, non-final: 7/9 * 1/4 = 7/36
//Sort+rev, final: 7/9 * 3/4 = 21/36
//Search, non-final: 1/2 * 1/4 = 1/8
//Search, final: 1/2 * 3/4 = 3/8
//Sort, non-final: 1/2 * 1/4 = 1/8
//Sort, final: 1/2 * 3/4 = 3/8
//All non-final must be done sequentially. Both Sort Final and non-final must be done before Search Final can start.
//This means the final time, if Sort Final is split off, is
//max(7/36+2/36, 21/36) + 6/36 = 27/36 = 3/4
//max(1/8+1/8, 3/8) + 3/8 = 6/8 = 3/4
//of the original time.
//Due to
//- the considerable complexity costs (OpenMP doesn't seem able to represent the "insert a wait in
@ -94,13 +91,13 @@
// and that the small ones are not, as that'd starve the big one
//I deem a possible 25% boost not worthwhile.
//Both sorting algorithms claim O(1) memory use (in addition to the bytes and the output). In
// addition to that, this algorithm uses (source.len+target.len)*(sizeof(uint8_t)+2*sizeof(off_t))
// bytes of memory, plus the input and output files, plus the patch.
//For most hardware, this is 9*(source.len+target.len), or 5*(source+target) for the slim one.
//Both sorting algorithms claim O(1) memory use, in addition to the in/outputs. For most hardware,
// this is 5*(source.len+target.len).
//If the output is stored to disk, that's all this algorithm needs as well.
namespace patch { namespace bps {
//TODO: HEAVY cleanups needed here
#include "sais.cpp"
template<typename sais_index_type>
static void sufsort(sais_index_type* SA, const uint8_t* T, sais_index_type n) {
@ -116,7 +113,7 @@ static void sufsort(sais_index_type* SA, const uint8_t* T, sais_index_type n) {
//I'd prefer to let them allocate from an array I give it, but divsuf doesn't allow that, and there
// are only half a dozen allocations per call anyways.
//This ends up in libdivsufsort if available, otherwise lite.
//This ends up in libdivsufsort if available, otherwise sais.cpp.
#include "divsufsort.h"
static void sufsort(int32_t* SA, uint8_t* T, int32_t n)
{
@ -140,6 +137,18 @@ template<typename T> static T max(T a, T b) { return a<b ? b : a; }
namespace {
//class filecache {
// file& f;
// uint32_t crc32;
// int bytes_used;
// uint8_t bytes[65536];
//
// void append(const uint8_t * data, size_t len)
// {
//
// }
//};
struct bps_creator {
uint8_t* out;
size_t outlen;
@ -206,7 +215,7 @@ struct bps_creator {
size_t numtargetread;
bps_creator(file* source, file* target, struct mem metadata)
bps_creator(const file& source, const file& target, struct mem metadata)
{
outlen = 0;
outbuflen = 128;
@ -220,12 +229,10 @@ struct bps_creator {
numtargetread = 0;
append((const uint8_t*)"BPS1", 4);
appendnum(source->len);
appendnum(target->len);
appendnum(source.size());
appendnum(target.size());
appendnum(metadata.len);
append(metadata.ptr, metadata.len);
setProgress(NULL, NULL);
}
@ -342,23 +349,7 @@ struct bps_creator {
}
bool (*prog_func)(void* userdata, size_t done, size_t total);
void* prog_dat;
static bool prog_func_null(void* userdata, size_t done, size_t total) { return true; }
void setProgress(bool (*progress)(void* userdata, size_t done, size_t total), void* userdata)
{
if (!progress) progress = prog_func_null;
prog_func=progress;
prog_dat=userdata;
}
bool progress(size_t done, size_t total)
{
return prog_func(prog_dat, done, total);
}
function<bool(size_t done, size_t total)> progress;
void finish(const uint8_t* source, const uint8_t* target)
@ -369,9 +360,9 @@ struct bps_creator {
puts("ERROR: patch creates wrong ROM size"),abort();
#endif
appendnum32(crc32(source, sourcelen));
appendnum32(crc32(target, targetlen));
appendnum32(crc32(out, outlen));
appendnum32(crc32(arrayview<byte>(source, sourcelen)));
appendnum32(crc32(arrayview<byte>(target, targetlen)));
appendnum32(crc32(arrayview<byte>(out, outlen)));
}
struct mem getpatch()
@ -545,10 +536,8 @@ static void create_buckets(const uint8_t* data, off_t* index, off_t len, off_t*
}
template<typename off_t>
static off_t find_index(off_t pos, const uint8_t* data, off_t datalen, const off_t* index, const off_t* reverse, off_t* buckets)
static off_t find_index(off_t pos, const uint8_t* data, off_t datalen, const off_t* index, const off_t* buckets)
{
if (reverse) return reverse[pos];
//if (datalen<2) return 0;
uint16_t bucket = read2(data+pos, datalen-pos);
//printf("p=%i b=%i\n",pos,bucket);
@ -654,29 +643,29 @@ off_t lerp(off_t x, off_t y, float frac)
}
template<typename off_t>
static bpserror bps_create_suf_core(file* source, file* target, bool moremem, struct bps_creator * out)
static result create_suf_core(const file& source, const file& target, struct bps_creator * out)
{
#define error(which) do { err = which; goto error; } while(0)
bpserror err;
result err;
size_t realsourcelen = source->len;
size_t realtargetlen = target->len;
size_t realsourcelen = source.size();
size_t realtargetlen = target.size();
size_t overflowtest = realsourcelen + realtargetlen;
//source+target length is bigger than size_t
if (overflowtest < realsourcelen) return bps_too_big;
if (overflowtest < realsourcelen) return e_too_big;
//source+target doesn't fit in unsigned off_t
if ((size_t)(off_t)overflowtest != overflowtest) return bps_too_big;
if ((size_t)(off_t)overflowtest != overflowtest) return e_too_big;
//source+target doesn't fit in signed off_t
if ((off_t)overflowtest < 0) return bps_too_big;
if ((off_t)overflowtest < 0) return e_too_big;
//the mallocs would overflow
if (realsourcelen+realtargetlen >= SIZE_MAX/sizeof(off_t)) return bps_too_big;
if (realsourcelen+realtargetlen >= SIZE_MAX/sizeof(off_t)) return e_too_big;
if (realsourcelen+realtargetlen >= out->maxsize()) return bps_too_big;
if (realsourcelen+realtargetlen >= out->maxsize()) return e_too_big;
off_t sourcelen = realsourcelen;
@ -686,19 +675,14 @@ static bpserror bps_create_suf_core(file* source, file* target, bool moremem, st
off_t* sorted = (off_t*)malloc(sizeof(off_t)*(realsourcelen+realtargetlen));
off_t* sorted_inverse = NULL;
if (moremem) sorted_inverse = (off_t*)malloc(sizeof(off_t)*(realsourcelen+realtargetlen));
off_t* buckets = (off_t*)malloc(sizeof(off_t)*65537);
off_t* buckets = NULL;
if (!sorted_inverse) buckets = (off_t*)malloc(sizeof(off_t)*65537);
if (!sorted || !mem_joined || (!sorted_inverse && !buckets))
if (!sorted || !mem_joined || !buckets)
{
free(mem_joined);
free(sorted);
free(sorted_inverse);
free(buckets);
return bps_out_of_mem;
return e_out_of_mem;
}
//sortedsize is how much of the target file is sorted
@ -720,35 +704,32 @@ static bpserror bps_create_suf_core(file* source, file* target, bool moremem, st
reindex:
//this isn't an exact science
const float percSort = sorted_inverse ? 0.67 : 0.50;
const float percInv = sorted_inverse ? 0.11 : 0.10;
//const float percFind = sorted_inverse ? 0.22 : 0.40; // unused
const float percSort = 0.50;
const float percBuck = 0.10;
//const float percFind = 0.40; // unused
const size_t progPreSort = lerp(prevsortedsize, sortedsize, 0);
const size_t progPreInv = lerp(prevsortedsize, sortedsize, percSort);
const size_t progPreFind = lerp(prevsortedsize, sortedsize, percSort+percInv);
const size_t progPreBuck = lerp(prevsortedsize, sortedsize, percSort);
const size_t progPreFind = lerp(prevsortedsize, sortedsize, percSort+percBuck);
prevsortedsize = sortedsize;
if (!out->progress(progPreSort, targetlen)) error(bps_canceled);
if (out->progress(progPreSort, targetlen)) error(e_canceled);
if (target->read(mem_joined, 0, sortedsize) < (size_t)sortedsize) error(bps_io);
if (source->read(mem_joined+sortedsize, 0, sourcelen) < (size_t)sourcelen) error(bps_io);
if (target.read(arrayvieww<byte>(mem_joined, sortedsize), 0) < (size_t)sortedsize) error(e_io);
if (source.read(arrayvieww<byte>(mem_joined+sortedsize, sourcelen), 0) < (size_t)sourcelen) error(e_io);
out->move_target(mem_joined);
sufsort(sorted, mem_joined, sortedsize+sourcelen);
if (!out->progress(progPreInv, targetlen)) error(bps_canceled);
if (out->progress(progPreBuck, targetlen)) error(e_canceled);
if (sorted_inverse)
create_reverse_index(sorted, sorted_inverse, sortedsize+sourcelen);
else
create_buckets(mem_joined, sorted, sortedsize+sourcelen, buckets);
create_buckets(mem_joined, sorted, sortedsize+sourcelen, buckets);
if (!out->progress(progPreFind, targetlen)) error(bps_canceled);
if (out->progress(progPreFind, targetlen)) error(e_canceled);
}
off_t matchlen = 0;
off_t matchpos = adjust_match(find_index(outpos, mem_joined, sortedsize+sourcelen, sorted, sorted_inverse, buckets),
off_t matchpos = adjust_match(find_index(outpos, mem_joined, sortedsize+sourcelen, sorted, buckets),
mem_joined+outpos, sortedsize-outpos,
mem_joined,sortedsize+sourcelen, outpos,sortedsize,
sorted, sortedsize+sourcelen,
@ -771,11 +752,10 @@ static bpserror bps_create_suf_core(file* source, file* target, bool moremem, st
out->finish(mem_joined+sortedsize, mem_joined);
err = bps_ok;
err = e_ok;
error:
free(buckets);
free(sorted_inverse);
free(sorted);
free(mem_joined);
@ -783,38 +763,41 @@ error:
}
//Chooses which index width create_suf_core is instantiated with, keyed on the unsigned type T
// (the caller in this file passes size_t).
//T = uint32_t: only the int32_t core is run.
//T = uint64_t: the int32_t core is tried first; only if it reports "too big" is the int64_t core
// run instead. Any other result of the first attempt is returned unchanged.
template<typename T> static bpserror bps_create_suf_pick(file* source, file* target, bool moremem, struct bps_creator * bps);
template<> bpserror bps_create_suf_pick<uint32_t>(file* source, file* target, bool moremem, struct bps_creator * bps)
template<typename T> static result create_suf_pick(const file& source, const file& target, struct bps_creator * bps);
template<> result create_suf_pick<uint32_t>(const file& source, const file& target, struct bps_creator * bps)
{
return bps_create_suf_core<int32_t>(source, target, moremem, bps);
return create_suf_core<int32_t>(source, target, bps);
}
template<> bpserror bps_create_suf_pick<uint64_t>(file* source, file* target, bool moremem, struct bps_creator * bps)
template<> result create_suf_pick<uint64_t>(const file& source, const file& target, struct bps_creator * bps)
{
bpserror err = bps_create_suf_core<int32_t>(source, target, moremem, bps);
if (err==bps_too_big) err = bps_create_suf_core<int64_t>(source, target, moremem, bps);
//Retry with 64-bit indices only when the 32-bit attempt said the input was too big.
result err = create_suf_core<int32_t>(source, target, bps);
if (err==e_too_big) err = create_suf_core<int64_t>(source, target, bps);
return err;
}
//This one picks a function based on 32-bit integers if that fits. This halves memory use for common inputs.
//It also handles some stuff related to the BPS headers and footers.
//
//source, target: the files to diff. metadata: mapped and handed to the bps_creator constructor, then
// unmapped again. patch: receives the finished patch via write().
//progress: stored in the creator and polled by the core.
//Returns whatever create_suf_pick returned if that is not e_ok; otherwise e_identical if the scan
// below ends exactly 12+1 bytes before the end of the patch, else e_ok.
extern "C"
bpserror bps_create_delta(file* source, file* target, struct mem metadata, struct mem * patchmem,
bool (*progress)(void* userdata, size_t done, size_t total), void* userdata, bool moremem)
result create(const file& source, const file& target, const file& metadata, file& patch,
function<bool(size_t done, size_t total)> progress)
{
bps_creator bps(source, target, metadata);
bps.setProgress(progress, userdata);
mem metamem = metadata.mmap();
bps_creator bps(source, target, metamem);
metadata.unmap(metamem.v());
bps.progress = progress;
//Offset just past the header the constructor emitted; the scan below starts here.
size_t maindata = bps.outlen;
//off_t must be signed
bpserror err = bps_create_suf_pick<size_t>(source, target, moremem, &bps);
if (err!=bps_ok) return err;
result err = create_suf_pick<size_t>(source, target, &bps);
if (err!=e_ok) return err;
*patchmem = bps.getpatch();
mem patchmem = bps.getpatch();
patch.write(patchmem.v());
while ((patchmem->ptr[maindata]&0x80) == 0x00) maindata++;
if (maindata==patchmem->len-12-1) return bps_identical;
return bps_ok;
//Inspect the patch bytes BEFORE releasing them: the buffer is read by the scan below, so calling
// free() first (as this code used to) is a use-after-free.
//The scan skips bytes whose top bit is clear, i.e. it stops on the first byte with 0x80 set.
// NOTE(review): presumably that is the last byte of the first command's number - confirm against the encoder.
while ((patchmem.ptr[maindata]&0x80) == 0x00) maindata++;
bool identical = (maindata==patchmem.len-12-1);
free(patchmem.ptr);
if (identical) return e_identical;
return e_ok;
}
@ -876,3 +859,4 @@ printf("%i/%i=%f\n",match_len_tot,match_len_n,(float)match_len_tot/match_len_n);
#endif
}
#endif
}}

View File

@ -1,6 +1,7 @@
#include "patch.h"
namespace patch { namespace bps {
//TODO: HEAVY cleanups needed here
static uint32_t read32(uint8_t * ptr)
{
uint32_t out;

View File

@ -74,6 +74,7 @@ struct info {
//Deprecated
struct mem {
mem() : ptr(NULL), len(0) {}
mem(uint8_t* ptr, size_t len) : ptr(ptr), len(len) {}
mem(arrayview<byte> v) : ptr((byte*)v.ptr()), len(v.size()) {}
arrayvieww<byte> v() { return arrayvieww<byte>(ptr, len); }
uint8_t * ptr;