#include "patch.h" //These two give minor performance penalties and will print some random stuff to stdout. //The former will verify the correctness of the output patch, the latter will print some performance data. //Can be useful for debugging, but should be disabled for release builds. #ifdef BPS_STANDALONE #endif //#define TEST_CORRECT //#define TEST_PERF //If the suffix array of [0, 0, 0, 0] is [3, 2, 1, 0], set to true. If it's [0, 1, 2, 3], this is false. //If it's [4, 3, 2, 1, 0] or [0, 1, 2, 3, 4], remove the 4 (easily done with some pointer math), and follow the above. //If it's something else, get a non-broken array calculator. #define EOF_IS_LAST false //Algorithm description: // //This is heavily built upon suffix sorting; the implementation I use, libdivsufsort, claims // O(n log n) complexity, so I'll believe that. There is also SA-IS, which claims O(n), but if that // is true, its constant factors are ridiculously high. // //The program starts by taking an equal amount of the source file and target file, concatenates that // with target first, and suffix sorts it. // //To find a match, it finds the sortpos where sorted[sortpos]==outpos. This is a binary search; it's // called O(n) times, with O(log n) comparisons per iteration. Each comparison is potentially O(n), // but for each matched byte, another iteration is removed from the outer loop, so the comparisons // can be considered O(1) each; the sum is O(n log n). //It could be replaced with a reverse index, reverse[sorted[x]]==x for all x, but that would cost a // lot of memory, and due to the cost of creating said index and only a few entries being used, it // doesn't save any time in practice. // //After it's found sortpos, it scans sorted[] up and down for the closest entry that either starts // before the current output position, or is somewhere in the source file. 
//As the source file comes last, the end-of-file marker (whose value is outside the range of a byte) // is guaranteed to not be in the way for a better match. //This is called O(n) times, and averages O(1) as at least 50% of sorted[] is in range. However, it // is worst-case O(n) for sorted inputs, giving a total of O(n^2). // //It then checks which of the two candidates is superior, by checking how far they match each // other, and then checking if the upper one has another correct byte. //This is potentially O(n), but like the binary search, long matches reduce the outer loop. The sum // is O(n). // //When the program approaches the end of the sorted area, it re-sorts about four times as much as // last time (see nextsize()). This gives O(log n) calls to the suffix sorter. //Given O(n log n) for one sorting step, the time taken is O(n/1 log n/1 + n/4 log n/4 + // n/16 log n/16 + ...), which is strictly less than O(n/1 log n + n/4 log n + n/16 log n + ...), which // equals O(4/3 n log n), which is O(n log n). // //Many details were omitted from the above, but that's the basic setup. // //Thus, the program is O(n log n) + O(n log n) + O(n) + O(n) = O(n log n) average and O(n log n) + // O(n log n) + O(n^2) + O(n) = O(n^2) worst case. // //As the quadratic worst case is not hit for random data or any other plausible output file, I // conclude that the task of finding, understanding and implementing a sub-quadratic algorithm for // delta patching is resolved. //Known cases where this function does not emit the optimal encoding: //If a match in the target file would extend further than target_search_size, it's cut off. // Penalty: O(log n), with extremely low constants (it'd require a >256B match to be exactly there). // Even for big files, the penalty is very likely to remain zero; even hitting double-digit bytes // would require a file designed exactly for that. //If multiple matches are equally good, it picks one at random, not the one that's cheaper to encode.
// Penalty: Likely O(n) or O(n log log n), with low constants. I'd guess ~1.4% for my 48MB test file. //However, due to better heuristics and others' performance optimizations, this one still beats its // competitors. //Heuristics are likely somewhat mistuned. //TODO: test multiple same-length matches // but only for lengths <= 16 or something, otherwise it'd take too long //Possible optimizations: //divsufsort() takes approximately 1/2 of the total time. //Each iteration takes four times as long as the previous one. //If each iteration takes 4 times as long as the previous one, then the last one takes 3/4 of the total time. //Since divsufsort doesn't depend on anything else, the last iteration can be split off to its own thread. //This would split it to //Search, non-final: 1/2 * 1/4 = 1/8 //Search, final: 1/2 * 3/4 = 3/8 //Sort, non-final: 1/2 * 1/4 = 1/8 //Sort, final: 1/2 * 3/4 = 3/8 //All non-final must be done sequentially. Both Sort Final and non-final must be done before Search Final can start. //This means the final time, if Sort Final is split off, is //max(1/8+1/8, 3/8) + 3/8 = 6/8 = 3/4 //of the original time. //Due to //- the considerable complexity costs (OpenMP doesn't seem able to represent the "insert a wait in // the middle of this while loop" operation I would need) //- the added memory use, approximately 25% higher - it's already high enough //- libdivsufsort already using threads, which would make the gains lower // and would increase complexity, as I have to ensure the big one remains threaded - // and that the small ones are not, as that'd starve the big one //I deem a possible 25% boost not worthwhile. //Another optimization would be possible if a faster suffix sorting algorithm were available. //Both SA-IS and libdivsufsort claim O(1) memory use, in addition to the in/outputs. For most // hardware, this is 5*(source.len+target.len). //The output file is also stored in memory, which is potentially slightly more than the output file // size.
This could be changed without too much trouble, but is unlikely to be worth it. namespace patch { namespace bps { #include "sais.cpp" /* Suffix-sorts the n bytes at T into SA through sais_main() with an alphabet size of 256. Inputs of length 0 or 1 never reach the sorter; for n==1 the single entry SA[0] is set to 0. */ template static void sufsort(sais_index_type* SA, const uint8_t* T, sais_index_type n) { if(n <= 1) { if(n == 1) SA[0] = 0; return; } sais_main(T, SA, 0, n, 256); } //According to , divsufsort achieves // approximately half the time of SAIS for nearly all files, despite SAIS' promises of linear // performance (divsufsort claims O(n log n)). //divsufsort only allocates O(1) for some radix/bucket sorting. SAIS seems constant too. //I'd prefer to let them allocate from an array I give it, but divsuf doesn't allow that, and there // are only half a dozen allocations per call anyways. //This ends up in libdivsufsort if available, otherwise sais.cpp. #include "divsufsort.h" /* 32-bit front end: hands T, SA and n straight to divsufsort(). */ static void sufsort(int32_t* SA, uint8_t* T, int32_t n) { divsufsort(T, SA, n); } #ifdef USE_DIVSUFSORT64 #include "divsufsort64.h" /* 64-bit front end, compiled only when USE_DIVSUFSORT64 is defined. */ static void sufsort(int64_t* SA, uint8_t* T, int64_t n) { divsufsort(T, SA, n); } #endif namespace { //class filecache { // file& f; // uint32_t crc32; // int bytes_used; // uint8_t bytes[65536]; // // void append(const uint8_t * data, size_t len) // { // // } //}; /* Collects the patch bytes in 'out' while tracking the output position (outpos), the end of the last source and target copies (sourcecopypos, targetcopypos), and the number of pending TargetRead bytes (numtargetread). */ struct bps_creator { array out; /* Appends num in variable-length form: while num >= 128 it emits the low 7 bits, shifts right by 7 and subtracts 1; the final byte is emitted with bit 0x80 set. */ void appendnum(size_t num) { #ifdef TEST_CORRECT if (num > 1000000000) printf("ERROR: Attempt to write %.8lX\n",(unsigned long)num),abort(); #endif while (num >= 128) { out.append((num&0x7F)); num>>=7; num--; } out.append(num|0x80); } /* Appends num>>0, num>>8, num>>16 and num>>24 in that order, i.e. least significant part first. */ void appendnum32(uint32_t num) { out.append(num>>0); out.append(num>>8); out.append(num>>16); out.append(num>>24); } /* Limit that create_suf_core() compares the combined source+target length against. */ static size_t maxsize() { return SIZE_MAX>>2; // can probably be reduced to SIZE_MAX>>1, but the mallocs overflow at that point anyways.
} size_t sourcelen; size_t targetlen; const uint8_t* targetmem; enum bpscmd { SourceRead, TargetRead, SourceCopy, TargetCopy }; size_t outpos; size_t sourcecopypos; size_t targetcopypos; size_t numtargetread; /* Zeroes the position counters, records both file sizes, and writes the header: the four bytes "BPS1", then source length, target length and metadata length via appendnum(), then the metadata contents (mapped, appended, unmapped). */ bps_creator(const file& source, const file& target, const file& metadata) { outpos = 0; sourcelen = source.size(); targetlen = target.size(); sourcecopypos = 0; targetcopypos = 0; numtargetread = 0; out += arrayview((byte*)"BPS1", 4); appendnum(sourcelen); appendnum(targetlen); appendnum(metadata.size()); arrayview tmp = metadata.mmap(); out += tmp; metadata.unmap(tmp); } /* Stores ptr in targetmem; create_suf_core() calls this with the joined buffer after every (re)read. */ void move_target(const uint8_t* ptr) { targetmem = ptr; } /* NOTE(review): the text from encode_delta() up to the end of flush_target_read() looks incomplete in this copy; it is left exactly as found and not documented further. */ size_t encode_delta(size_t prev, size_t next) { bool negative = (next(targetmem+outpos-numtargetread, numtargetread); numtargetread = 0; } /* Emits a SourceCopy of count bytes starting at source offset 'location', unless location equals outpos, in which case emit_source_read() is used instead. Flushes pending TargetRead bytes first, sets sourcecopypos to location+count, advances outpos and returns count. */ size_t emit_source_copy(size_t location, size_t count) { if (location == outpos) return emit_source_read(location, count); flush_target_read(); append_cmd(SourceCopy, count); append_delta(sourcecopypos, location); sourcecopypos = location+count; outpos += count; return count; } /* Emits a SourceRead of count bytes after flushing pending TargetRead bytes; advances outpos and returns count. 'location' is only inspected by the TEST_CORRECT check. */ size_t emit_source_read(size_t location, size_t count) { flush_target_read(); #ifdef TEST_CORRECT if (location != outpos) puts("ERROR: SourceRead not from source pointer"),abort(); #endif append_cmd(SourceRead, count); outpos+=count; return count; } /* Emits a TargetCopy of count bytes starting at target offset 'location'; flushes pending TargetRead bytes first, sets targetcopypos to location+count, advances outpos and returns count. */ size_t emit_target_copy(size_t location, size_t count) { flush_target_read(); append_cmd(TargetCopy, count); append_delta(targetcopypos, location); targetcopypos = location+count; outpos += count; return count; } /* Queues one byte for a later TargetRead: increments numtargetread and outpos, returns 1. Nothing is written to 'out' here. */ size_t emit_target_read() { numtargetread++; outpos++; return 1; } /* NOTE(review): abs_diff() through use_match() also look incomplete in this copy; left as found. */ size_t abs_diff(size_t a, size_t b) { return (b= 1+cost+hastargetread+(len==1); } //Return value is how many bytes were used. If you believe the given one sucks, use TargetRead and return 1. size_t match(bool is_target, size_t pos, size_t len) { if (!use_match( numtargetread, (!is_target && pos==outpos) ? 1 : // SourceRead (num_cost(abs_diff(pos, (is_target ? 
targetcopypos : sourcecopypos)))+1), len )) { return emit_target_read(); } if (is_target) return emit_target_copy(pos, len); else return emit_source_copy(pos, len); } function progress; /* Flushes pending TargetRead bytes, then appends three 32-bit values: crc32 of the source bytes, crc32 of the target bytes, and crc32 of everything in 'out' up to that point. */ void finish(const uint8_t* source, const uint8_t* target) { flush_target_read(); #ifdef TEST_CORRECT if (outpos != targetlen) puts("ERROR: patch creates wrong ROM size"),abort(); #endif appendnum32(crc32(arrayview(source, sourcelen))); appendnum32(crc32(arrayview(target, targetlen))); appendnum32(crc32(out)); } /* Number of patch bytes accumulated so far. */ size_t outlen() { return out.size(); } /* Hands the accumulated bytes to the caller by moving 'out'. */ array getpatch() { return std::move(out); } }; } #ifdef TEST_PERF static int match_len_n=0; static int match_len_tot=0; #endif /* NOTE(review): from match_len() through the first loops of adjust_match() parts of the text look missing in this copy (loop bodies and function tails); left as found and documented only where intact. */ static size_t match_len(const uint8_t* a, const uint8_t* b, size_t len) { //don't replace with memcmp_d, the average match length is so small it's a net loss size_t i; for (i=0;i static off_t pick_best_of_two(const uint8_t* search, off_t searchlen, const uint8_t* data, off_t datalen, off_t a, off_t b, off_t* bestlen) { off_t commonlen = match_len(data+a, data+b, min(datalen-a, datalen-b)); if (commonlen>=searchlen) { *bestlen=searchlen; return a; } if (a+commonlen static off_t adjust_match(off_t match, const uint8_t* search, off_t searchlen, const uint8_t* data,off_t datalen, off_t maxstart,off_t minstart, const off_t* sorted, off_t sortedlen, off_t* bestlen) { off_t match_up = match; off_t match_dn = match; while (match_up>=0 && sorted[match_up]>=maxstart && sorted[match_up]=maxstart && sorted[match_dn]=sortedlen) { if (match_up<0 && match_dn>=sortedlen) { *bestlen=0; return 0; } off_t pos = sorted[match_up<0 ? 
match_dn : match_up]; *bestlen = match_len(search, data+pos, min(searchlen, datalen-pos)); return pos; } return pick_best_of_two(search,searchlen, data,datalen, sorted[match_up],sorted[match_dn], bestlen); } /* Combines data[0] (high byte) and data[1] (low byte) into one 16-bit value; performs no bounds check. */ static uint16_t read2_uc(const uint8_t* data) { return data[0]<<8 | data[1]; } /* Length-aware variant of read2_uc(): with len>=2 it reads both bytes; otherwise the missing bytes are taken from 0xFFFF when EOF_IS_LAST is true and from 0x0000 when it is false, with data[0] used as the high byte if len==1. */ template static uint16_t read2(const uint8_t* data, off_t len) { if (len>=2) return read2_uc(data); else { uint16_t out = (EOF_IS_LAST ? 0xFFFF : 0x0000); if (len==1) out = (data[0]<<8) | (out&0x00FF); return out; } } /* Fills buckets[0..65536]: for each 16-bit value n it records where in index[] the entries whose read2() prefix is >= n begin, and sets buckets[65536] to len. Each n starts from the previous result, widens the probe window step by step until it passes a prefix >= n or reaches len-1, then binary-searches inside that window. */ template static void create_buckets(const uint8_t* data, off_t* index, off_t len, off_t* buckets) { off_t low = 0; off_t high; for (int n=0;n<65536;n++) { //'low' remains from the previous iteration and is a known minimum high = low+(len/131072)+1; // optimal value: slightly above a third of the distance to the next one while (true) { if (high > len-1) break; off_t pos = index[high]; uint16_t here = read2(data+pos, len-pos); if (here >= n) break; else { off_t diff = high-low; low = high; high = high+diff*2; } } if (high > len-1) high = len-1; while (low < high) { off_t mid = low + (high-low)/2; off_t midpos = index[mid]; uint16_t here = read2(data+midpos, len-midpos); if (here < n) low = mid+1; else high = mid; } buckets[n] = low; } buckets[65536] = len; #ifdef TEST_CORRECT if (buckets[0]!=0) { printf("e: buckets suck, [0]=%i\n", buckets[0]); abort(); } for (int n=0;n<65536;n++) { off_t low = buckets[n]; off_t high = buckets[n+1]; for (off_t i=low;i /* Returns the position i with index[i]==pos. The search is confined to the bucket of pos's two-byte prefix and bisects by comparing the suffix at pos with the suffix at index[mid], skipping the bytes already known to match on both sides; once the window holds fewer than about 256 entries it scans forward from 'low' until it finds pos. */ static off_t find_index(off_t pos, const uint8_t* data, off_t datalen, const off_t* index, const off_t* buckets) { //if (datalen<2) return 0; uint16_t bucket = read2(data+pos, datalen-pos); //printf("p=%i b=%i\n",pos,bucket); //TODO //off_t low = 0; //off_t high = datalen-1; off_t low = buckets[bucket]; off_t high = buckets[bucket+1]-1; off_t lowmatch = 2; off_t highmatch = 2; //printf("b=%i r=%i(%i)-%i(%i)\n",bucket,low,read2(data+index[low],datalen-index[low]),high,read2(data+index[high],datalen-index[high])); 
//fflush(stdout); while (true) { off_t mid = low + (high-low)/2; off_t midpos = index[mid]; if (midpos == pos) return mid; //printf("r=[%i]%i-%i \n",high-low,low,high,); //fflush(stdout); #ifdef TEST_CORRECT if (low >= high) { printf("E: [%i](%i): stuck at %i(%i)-%i(%i)\n", pos, read2_uc(data+pos), low, read2_uc(data+index[low]), high, read2_uc(data+index[high])); int n=0; while (index[n]!=pos) n++; printf("correct one is %i(%i)\n",n, read2_uc(data+index[n])); abort(); } #endif off_t matchlenstart = min(lowmatch, highmatch); off_t len = datalen - max(pos, midpos) - matchlenstart; const uint8_t* search = data+pos+matchlenstart; const uint8_t* here = data+midpos+matchlenstart; //don't replace with match_len, it's not inlined properly while (len>0 && *search==*here) { search++; here++; len--; } off_t matchlen = search-data-pos; bool less; if (len > 0) less = (*here<*search); else less = (here > search) ^ EOF_IS_LAST; if (less) { low = mid+1; lowmatch = matchlen; } else { high = mid-1; highmatch = matchlen; } if (low+256 > high) { off_t i=low; while (true) { if (index[i]==pos) return i; i++; } } } } /* Grows sortedsize to min(sortedsize*4+3, targetlen) repeatedly while outpos is within 256 of it and it is still below targetlen; returns the resulting size. */ template static off_t nextsize(off_t outpos, off_t sortedsize, off_t targetlen) { while (outpos >= sortedsize-256 && sortedsize < targetlen) sortedsize = min(sortedsize*4+3, targetlen); return sortedsize; } /* Returns x + (y-x)*frac; the multiplication by the float 'frac' happens before the result is converted back to off_t. */ template off_t lerp(off_t x, off_t y, float frac) { return x + (y-x)*frac; } /* Core patch builder. Returns e_too_big when the combined length overflows size_t, does not survive a round trip through off_t, is negative as off_t, or reaches the allocation/maxsize() limits; returns e_out_of_mem when any of the three buffers (joined data, suffix array, 65537-entry bucket table) cannot be allocated. It then reads the first sortedsize bytes of the target followed by the whole source into mem_joined, sorts and buckets that, and loops until outpos reaches targetlen: locate outpos in the suffix array, pick a match, pass it to out->match(), advance by the number of bytes it reports. When outpos comes within 256 of sortedsize (and more target remains) the prefix is enlarged and everything is re-read and re-indexed. A progress callback returning true gives e_canceled, a short read gives e_io. Every exit after the allocation check frees all three buffers. */ template static result create_suf_core(const file& source, const file& target, struct bps_creator * out) { #define error(which) do { err = which; goto error; } while(0) result err; size_t realsourcelen = source.size(); size_t realtargetlen = target.size(); size_t overflowtest = realsourcelen + realtargetlen; //source+target length is bigger than size_t if (overflowtest < realsourcelen) return e_too_big; //source+target doesn't fit in unsigned off_t if ((size_t)(off_t)overflowtest != overflowtest) return e_too_big; //source+target doesn't fit in signed off_t if ((off_t)overflowtest < 0) return 
e_too_big; //the mallocs would overflow if (realsourcelen+realtargetlen >= SIZE_MAX/sizeof(off_t)) return e_too_big; if (realsourcelen+realtargetlen >= out->maxsize()) return e_too_big; off_t sourcelen = realsourcelen; off_t targetlen = realtargetlen; uint8_t* mem_joined = (uint8_t*)malloc(sizeof(uint8_t)*(realsourcelen+realtargetlen)); off_t* sorted = (off_t*)malloc(sizeof(off_t)*(realsourcelen+realtargetlen)); off_t* buckets = (off_t*)malloc(sizeof(off_t)*65537); if (!sorted || !mem_joined || !buckets) { free(mem_joined); free(sorted); free(buckets); return e_out_of_mem; } //sortedsize is how much of the target file is sorted off_t sortedsize = targetlen; //divide by 4 for each iteration, to avoid sorting 50% of the file (the sorter is slow) while (sortedsize/4 > sourcelen && sortedsize > 1024) sortedsize >>= 2; off_t prevsortedsize = 0; off_t outpos = 0; goto reindex; // jump into the middle so I won't need a special case to enter it while (outpos < targetlen) { if (outpos >= sortedsize-256 && sortedsize < targetlen) { sortedsize = nextsize(outpos, sortedsize, targetlen); reindex: //this isn't an exact science const float percSort = 0.50; const float percBuck = 0.10; //const float percFind = 0.40; // unused const size_t progPreSort = lerp(prevsortedsize, sortedsize, 0); const size_t progPreBuck = lerp(prevsortedsize, sortedsize, percSort); const size_t progPreFind = lerp(prevsortedsize, sortedsize, percSort+percBuck); prevsortedsize = sortedsize; if (out->progress(progPreSort, targetlen)) error(e_canceled); if (target.read(arrayvieww(mem_joined, sortedsize), 0) < (size_t)sortedsize) error(e_io); if (source.read(arrayvieww(mem_joined+sortedsize, sourcelen), 0) < (size_t)sourcelen) error(e_io); out->move_target(mem_joined); if (targetlen==0) goto emitempty; // size-0 targets are silly, but have to be handled sufsort(sorted, mem_joined, sortedsize+sourcelen); if (out->progress(progPreBuck, targetlen)) error(e_canceled); create_buckets(mem_joined, sorted, 
sortedsize+sourcelen, buckets); if (out->progress(progPreFind, targetlen)) error(e_canceled); } off_t matchlen = 0; off_t matchpos = adjust_match(find_index(outpos, mem_joined, sortedsize+sourcelen, sorted, buckets), mem_joined+outpos, sortedsize-outpos, mem_joined,sortedsize+sourcelen, outpos,sortedsize, sorted, sortedsize+sourcelen, &matchlen); #ifdef TEST_CORRECT if (matchlen && matchpos >= outpos && matchpos < sortedsize) puts("ERROR: found match in invalid location"),abort(); if (memcmp(mem_joined+matchpos, mem_joined+outpos, matchlen)) puts("ERROR: found match doesn't match"),abort(); #endif off_t taken; if (matchpos >= sortedsize) taken = out->match(false, matchpos-sortedsize, matchlen); else taken = out->match(true, matchpos, matchlen); #ifdef TEST_CORRECT if (taken < 0) puts("ERROR: match() returned negative"),abort(); if (matchlen >= 7 && taken < matchlen) printf("ERROR: match() took %i bytes, offered %i\n", taken, matchlen),abort(); #endif outpos += taken; } emitempty: out->finish(mem_joined+sortedsize, mem_joined); err = e_ok; error: free(buckets); free(sorted); free(mem_joined); return err; } /* Dispatcher over create_suf_core(). Two specializations follow: the first makes a single call, the second calls create_suf_core() and calls it again when the first attempt returns e_too_big. NOTE(review): the template arguments that distinguish the two specializations and the two calls are not visible in this copy - presumably different index widths; confirm. */ template static result create_suf_pick(const file& source, const file& target, struct bps_creator * bps); template<> result create_suf_pick(const file& source, const file& target, struct bps_creator * bps) { return create_suf_core(source, target, bps); } template<> result create_suf_pick(const file& source, const file& target, struct bps_creator * bps) { result err = create_suf_core(source, target, bps); if (err==e_too_big) err = create_suf_core(source, target, bps); return err; } //This one picks a function based on 32-bit integers if that fits. This halves memory use for common inputs. //It also handles some stuff related to the BPS headers and footers.
result create(const file& source, const file& target, const file& metadata, array& patch, function progress) { bps_creator bps(source, target, metadata); bps.progress = progress; size_t maindata = bps.outlen(); //off_t must be signed result err = create_suf_pick(source, target, &bps); if (err!=e_ok) return err; patch = bps.getpatch(); while ((patch[maindata]&0x80) == 0x00) maindata++; if (maindata==patch.size()-12-1) return e_identical; return e_ok; } #ifdef BPS_STANDALONE #include static struct mem ReadWholeFile(const char * filename) { struct mem null = {NULL, 0}; FILE * file=fopen(filename, "rb"); if (!file) return null; fseek(file, 0, SEEK_END); size_t len=ftell(file); fseek(file, 0, SEEK_SET); unsigned char * data=(unsigned char*)malloc(len); size_t truelen=fread(data, 1,len, file); fclose(file); if (len!=truelen) { free(data); return null; } struct mem ret = { (unsigned char*)data, len }; return ret; } static bool WriteWholeFile(const char * filename, struct mem data) { FILE * file=fopen(filename, "wb"); if (!file) return false; unsigned int truelen=fwrite(data.ptr, 1,data.len, file); fclose(file); return (truelen==data.len); } int main(int argc, char * argv[]) { //struct mem out = ReadWholeFile(argv[2]); //printf("check=%.8X\n",crc32(out.ptr, out.len)); struct mem in = ReadWholeFile(argv[1]); struct mem out = ReadWholeFile(argv[2]); struct mem null = {NULL, 0}; struct mem p={NULL,0}; //int n=50; //for(int i=0;i