From c39b50a2362cb104b9fbbec755d653d67aaea0a8 Mon Sep 17 00:00:00 2001 From: William Toohey Date: Tue, 13 Jun 2017 23:52:50 +1000 Subject: [PATCH] Fix various issues, move to class structure --- bin_xml.py | 593 ++++++++++++++++++++++++++------------------------ bytebuffer.py | 42 +++- format_ids.py | 2 +- 3 files changed, 344 insertions(+), 293 deletions(-) diff --git a/bin_xml.py b/bin_xml.py index 555d28a..61d2507 100644 --- a/bin_xml.py +++ b/bin_xml.py @@ -4,8 +4,7 @@ import string from bitarray import bitarray from bytebuffer import ByteBuffer from format_ids import xml_formats, xml_types - -import IPython +import sys DEBUG_OFFSETS = False DEBUG = False @@ -25,301 +24,331 @@ def debug_print(string): if DEBUG: print string -def pack_bits(string, nodeBuf, bits = 6): - chars = str_to_sixbit(string) - bits = bitarray(endian='big') - for c in chars: - bits.frombytes(c) - del bits[-8:-6] - for c in bits.tobytes(): - nodeBuf.append_u8(ord(c)) +class kbinxml(): -def unpack_bits(bitArray, byteBuf, length, bits = 6): - result = [] - offset = byteBuf.offset * 8 - for i in range(length): - result.append(ord(bitArray[offset:offset+bits].tobytes()) >> (8 - bits)) - offset += bits - # padding - byteBuf.offset += (length * bits + 7) // 8 - return sixbit_to_str(result) - -# 0-9 for numbers, 10 to 36 for capitals, 37 for underscore, 38-63 for lowercase -def sixbit_to_str(decompressed): - string = '' - for d in decompressed: - if d <= 10: - d += ord('0') - elif d < 37: - d += 54 - elif d == 37: - d += 58 + def __init__(self, input): + if isinstance(input, minidom.Document): + self.xml_doc = input + elif self.is_binary_xml(input): + self.from_binary(input) else: - d += 59 - string += chr(d) - return string + self.from_text(input) -def str_to_sixbit(string): - compress = [] - for c in string: - if c >= '0' and c <= '9': - compress.append(ord(c) - ord('0')) - elif c >= 'A' and c <= 'Z': - compress.append(ord(c) - 54) - elif c == '_': - compress.append(ord(c) - 58) - elif c >= 'a' and c <= 'z': - compress.append(ord(c) - 59) - else: - raise ValueError('Node name can only contain alphanumeric + underscore') - return ''.join(map(chr, compress)) + def pack_bits(self, string, bits = 6): + chars = self.str_to_sixbit(string) + bits = bitarray(endian='big') + for c in chars: + bits.frombytes(c) + del bits[-8:-6] + for c in bits.tobytes(): + self.nodeBuf.append_u8(ord(c)) -def data_grab_auto(dataBuf): - size = dataBuf.get_s32() - ret = [dataBuf.get_u8() for x in range(size)] - # padding - dataBuf.offset += 3 - # round to dword - dataBuf.offset &= ~0b11 - return ret + def unpack_bits(self, length, bits = 6): + result = [] + offset = self.nodeBuf.offset * 8 + for i in range(length): + result.append(ord(self.nodeBits[offset:offset+bits].tobytes()) >> (8 - bits)) + offset += bits + # padding + self.nodeBuf.offset += (length * bits + 7) // 8 + return self.sixbit_to_str(result) -def data_append_auto(dataBuf, data): - dataBuf.append_s32(len(data)) - dataBuf.append(data, 's', len(data)) - - # padding - while len(dataBuf) % 4: - dataBuf.append_u8(0) - -def data_append_string(dataBuf, string): - string = string.encode('shift_jisx0213') - data_append_auto(dataBuf, string) - -def data_grab_string(dataBuf): - data = data_grab_auto(dataBuf) - res = '' - for b in data: - if b == 0: - break - res += chr(b) - return res.decode('shift_jisx0213') - -# has its own separate state and other assorted garbage -def data_grab_aligned(dataBuf, dataByteBuf, dataWordBuf, type, count): - if dataByteBuf.offset % 4 == 0: - dataByteBuf.offset = dataBuf.offset - if dataWordBuf.offset % 4 == 0: - dataWordBuf.offset = dataBuf.offset - # multiply by count since 2u2 reads from the 16 bit buffer, for example - size = calcsize(type) * count - if size == 1: - ret = dataByteBuf.get(type, count) - elif size == 2: - ret = dataWordBuf.get(type, count) - else: - ret = dataBuf.get(type, count) - trailing = max(dataByteBuf.offset, dataWordBuf.offset) - if dataBuf.offset < trailing: - dataBuf.offset = trailing + 3 - dataBuf.offset &= ~0b11 - return ret - -def is_binary_xml(input): - nodeBuf = ByteBuffer(input) - return nodeBuf.get_u16() == SIGNATURE - -def _xml_node_to_binary(node, nodeBuf, dataBuf): - nodeType = node.getAttribute('__type') - if not nodeType: - nodeType = 'void' - nodeId = xml_types[nodeType] - - isArray = 0 - count = node.getAttribute('__count') - if count: - count = int(count) - isArray = 64 # bit position for array flag - - nodeBuf.append_u8(nodeId | isArray) - - name = node.nodeName - nodeBuf.append_u8(len(name)) - pack_bits(name, nodeBuf) - - import operator - sorted_x = sorted(node.attributes.items(), key=operator.itemgetter(0)) - for key, value in sorted_x:#node.attributes.items(): - if key in ['__type', '__size', '__count']: - pass - else: - data_append_string(dataBuf, value) - nodeBuf.append_u8(xml_types['attr']) - nodeBuf.append_u8(len(key)) - pack_bits(key, nodeBuf) - - if nodeType != 'void': - nodeId = xml_types[nodeType] - fmt = xml_formats[nodeId] - - data = map(fmt['pType'], node.firstChild.nodeValue.split(fmt.get('delimiter', ' '))) - - if fmt['count'] == -1 or not isArray: - data = data[0] - if isArray or fmt['count'] == -1: - dataBuf.append_u32(len(data)) - if isArray: - for d in data: - dataBuf.append(d, fmt['type']) + # 0-9 for numbers, 10 to 36 for capitals, 37 for underscore, 38-63 for lowercase + def sixbit_to_str(self, decompressed): + string = '' + for d in decompressed: + if d <= 10: + d += ord('0') + elif d < 37: + d += 54 + elif d == 37: + d += 58 else: - dataBuf.append(data, fmt['type']) + d += 59 + string += chr(d) + return string + + def str_to_sixbit(self, string): + compress = [] + for c in string: + if c >= '0' and c <= '9': + compress.append(ord(c) - ord('0')) + elif c >= 'A' and c <= 'Z': + compress.append(ord(c) - 54) + elif c == '_': + compress.append(ord(c) - 58) + elif c >= 'a' and c <= 'z': + compress.append(ord(c) - 59) + else: + raise ValueError('Node name can only contain alphanumeric + underscore') + return ''.join(map(chr, compress)) + + def data_grab_auto(self): + size = self.dataBuf.get_s32() + ret = [self.dataBuf.get_u8() for x in range(size)] + # padding + self.dataBuf.offset += 3 + # round to dword + self.dataBuf.offset &= ~0b11 + return ret + + def data_append_auto(self, data): + self.dataBuf.append_s32(len(data)) + self.dataBuf.append(data, 's', len(data)) + + # padding + while len(self.dataBuf) % 4: + self.dataBuf.append_u8(0) + + def data_append_string(self, string): + string = string.encode('shift_jisx0213') + '\0' + self.data_append_auto(string) + + def data_grab_string(self): + data = self.data_grab_auto() + res = '' + for b in data: + if b == 0: + break + res += chr(b) + return res.decode('shift_jisx0213') + + # has its own separate state and other assorted garbage + def data_grab_aligned(self, type, count): + if self.dataByteBuf.offset % 4 == 0: + self.dataByteBuf.offset = self.dataBuf.offset + if self.dataWordBuf.offset % 4 == 0: + self.dataWordBuf.offset = self.dataBuf.offset + # multiply by count since 2u2 reads from the 16 bit buffer, for example + size = calcsize(type) * count + if size == 1: + ret = self.dataByteBuf.get(type, count) + elif size == 2: + ret = self.dataWordBuf.get(type, count) else: - data_append_aligned(dataBuf, dataByteBuf, dataWordBuf, fmt['type'], fmt['count']) + ret = self.dataBuf.get(type, count) + trailing = max(self.dataByteBuf.offset, self.dataWordBuf.offset) + if self.dataBuf.offset < trailing: + self.dataBuf.offset = trailing + 3 + self.dataBuf.offset &= ~0b11 + return ret - for child in node.childNodes: - if child.nodeType != child.TEXT_NODE: - _xml_node_to_binary(child, nodeBuf, dataBuf) - - nodeBuf.append_u8(xml_types['nodeEnd'] | 64) - -def xml_text_to_binary(input): - return xml_to_binary(minidom.parseString(input)) - -def xml_to_binary(input): - header = ByteBuffer() - header.append_u16(SIGNATURE) - header.append_u8(4 << 5) # SHIFT-JIS TODO make encoding variable - header.append_u8(0x7F) # TODO what does this do as 7f or ff - nodeBuf = ByteBuffer() - dataBuf = ByteBuffer() - - for child in input.childNodes: - _xml_node_to_binary(child, nodeBuf, dataBuf) - - nodeBuf.append_u8(xml_types['endSection'] | 64) - while len(nodeBuf) % 4 != 0: - nodeBuf.append_u8(0) - header.append_u32(len(nodeBuf)) - nodeBuf.append_u32(len(dataBuf)) - return header.data + nodeBuf.data + dataBuf.data - -def binary_to_xml_text(input): - return binary_to_xml(input).toprettyxml(indent=" ", encoding='UTF-8') - -def binary_to_xml(input): - doc = minidom.Document() - node = doc - - nodeBuf = ByteBuffer(input) - assert nodeBuf.get_u16() == SIGNATURE - encoding = encodings[(nodeBuf.get_u8() & 0xE0) >> 5] - unknown = nodeBuf.get_u8() - - # creating bitarrays is slow, cache for speed - nodeBits = bitarray(endian='big') - nodeBits.frombytes(input) - - nodeEnd = nodeBuf.get_u32() + 8 - nodeBuf.end = nodeEnd - - dataBuf = ByteBuffer(input, nodeEnd) - dataSize = dataBuf.get_u32() - # WHY MUST YOU DO THIS TO ME - dataByteBuf = ByteBuffer(input, nodeEnd) - dataWordBuf = ByteBuffer(input, nodeEnd) - - nodesLeft = True - while nodesLeft and nodeBuf.hasData(): - while nodeBuf.peek_u8() == 0: - debug_print("Skipping 0 node ID") - nodeBuf.get_u8() - - nodeType = nodeBuf.get_u8() - isArray = nodeType & 64 - nodeType &= ~64 - - nodeFormat = xml_formats.get(nodeType, {'name':'Unknown'}) - debug_print('Node type is {} ({})'.format(nodeFormat['name'], nodeType)) - - # node name - name = '' - if nodeType != xml_types['nodeEnd'] and nodeType != xml_types['endSection']: - strLen = nodeBuf.get_u8() - name = unpack_bits(nodeBits, nodeBuf, strLen) - debug_print(name) - - skip = True - - if nodeType == xml_types['attr']: - value = data_grab_string(dataBuf) - node.setAttribute(name, value) - elif nodeType == xml_types['nodeEnd']: - if node.parentNode: - node = node.parentNode - elif nodeType == xml_types['endSection']: - nodesLeft = False - elif nodeType not in xml_formats: - raise NotImplementedError('Implement node {}'.format(nodeType)) - else: # inner value to process - skip = False - - if skip: - continue - - child = doc.createElement(name) - node.appendChild(child) - node = child - - if nodeType == xml_types['nodeStart']: - continue - - node.setAttribute('__type', nodeFormat['name']) - - if isArray: - arrayCount = dataBuf.get_u32() - node.setAttribute('__count', str(arrayCount)) + def data_append_aligned(self, data, type, count): + if self.dataByteBuf.offset % 4 == 0: + self.dataByteBuf.offset = self.dataBuf.offset + if self.dataWordBuf.offset % 4 == 0: + self.dataWordBuf.offset = self.dataBuf.offset + # multiply by count since 2u2 reads from the 16 bit buffer, for example + size = calcsize(type) * count + if size == 1: + # make room if fresh dword for our stuff + if self.dataByteBuf.offset % 4 == 0: + self.dataBuf.append_u32(0) + self.dataByteBuf.set(data, self.dataByteBuf.offset, type, count) + elif size == 2: + if self.dataWordBuf.offset % 4 == 0: + self.dataBuf.append_u32(0) + self.dataWordBuf.set(data, self.dataWordBuf.offset, type, count) else: - arrayCount = 1 - varCount = nodeFormat['count'] - if varCount == -1: - varCount = dataBuf.get_u32() - totalCount = arrayCount * varCount + self.dataBuf.append(data, type, count) - delim = nodeFormat.get('delimiter', ' ') + def is_binary_xml(self, input): + nodeBuf = ByteBuffer(input) + return nodeBuf.get_u16() == SIGNATURE - if isArray or nodeFormat['count'] == -1: - try: - data = dataBuf.get(nodeFormat['type'], totalCount) - except: - print doc.toprettyxml(indent=" ", encoding='UTF-8') - IPython.embed() - dataBuf.offset += 3 # padding - dataBuf.offset &= ~0b11 # align to dword - else: - data = data_grab_aligned(dataBuf, dataByteBuf, dataWordBuf, nodeFormat['type'], totalCount) - string = delim.join(map(str, data)) + def _node_to_binary(self, node): + nodeType = node.getAttribute('__type') + if not nodeType: + nodeType = 'void' + nodeId = xml_types[nodeType] - if nodeType == xml_types['binary']: - node.setAttribute('__size', str(totalCount)) - string = ''.join(('{0:02x}'.format(ord(x)) for x in string)) - if nodeType == xml_types['string']: - string = string[:-1].decode('shift_jisx0213') + isArray = 0 + count = node.getAttribute('__count') + if count: + count = int(count) + isArray = 64 # bit position for array flag - node.appendChild(doc.createTextNode(string)) + self.nodeBuf.append_u8(nodeId | isArray) - #print doc.toprettyxml(indent=" ", encoding='UTF-8') - return doc + name = node.nodeName + self.nodeBuf.append_u8(len(name)) + self.pack_bits(name) + import operator + sorted_x = sorted(node.attributes.items(), key=operator.itemgetter(0)) + for key, value in sorted_x:#node.attributes.items(): + if key in ['__type', '__size', '__count']: + pass + else: + self.data_append_string(value) + self.nodeBuf.append_u8(xml_types['attr']) + self.nodeBuf.append_u8(len(key)) + self.pack_bits(key) + + if nodeType != 'void': + fmt = xml_formats[nodeId] + + val = node.firstChild.nodeValue + if fmt['count'] != -1: + val = val.split(fmt.get('delimiter', ' ')) + data = map(fmt['pType'], val) + else: + data = fmt['pType'](val) + + if isArray or fmt['count'] == -1: + self.dataBuf.append_u32(len(data) * calcsize(fmt['type'])) + self.dataBuf.append(data, fmt['type'], len(data)) + # padding + while len(self.dataBuf) % 4: + self.dataBuf.append_u8(0) + else: + self.data_append_aligned(data, fmt['type'], fmt['count']) + + for child in node.childNodes: + if child.nodeType != child.TEXT_NODE: + self._node_to_binary(child) + + self.nodeBuf.append_u8(xml_types['nodeEnd'] | 64) + + def from_text(self, input): + self.xml_doc = minidom.parseString(input) + + def to_binary(self): + header = ByteBuffer() + header.append_u16(SIGNATURE) + header.append_u8(4 << 5) # SHIFT-JIS TODO make encoding variable + header.append_u8(0x7F) # TODO what does this do as 7f or ff + self.nodeBuf = ByteBuffer() + self.dataBuf = ByteBuffer() + self.dataByteBuf = ByteBuffer(self.dataBuf.data) + self.dataWordBuf = ByteBuffer(self.dataBuf.data) + + for child in self.xml_doc.childNodes: + self._node_to_binary(child) + + self.nodeBuf.append_u8(xml_types['endSection'] | 64) + while len(self.nodeBuf) % 4 != 0: + self.nodeBuf.append_u8(0) + header.append_u32(len(self.nodeBuf)) + self.nodeBuf.append_u32(len(self.dataBuf)) + return bytes(header.data + self.nodeBuf.data + self.dataBuf.data) + + def to_text(self): + return self.xml_doc.toprettyxml(indent=" ", encoding='UTF-8') + + def from_binary(self, input): + self.xml_doc = minidom.Document() + node = self.xml_doc + + self.nodeBuf = ByteBuffer(input) + assert self.nodeBuf.get_u16() == SIGNATURE + encoding = encodings[(self.nodeBuf.get_u8() & 0xE0) >> 5] + unknown = self.nodeBuf.get_u8() + + # creating bitarrays is slow, cache for speed + self.nodeBits = bitarray(endian='big') + self.nodeBits.frombytes(input) + + nodeEnd = self.nodeBuf.get_u32() + 8 + self.nodeBuf.end = nodeEnd + + self.dataBuf = ByteBuffer(input, nodeEnd) + dataSize = self.dataBuf.get_u32() + # WHY MUST YOU DO THIS TO ME + self.dataByteBuf = ByteBuffer(input, nodeEnd) + self.dataWordBuf = ByteBuffer(input, nodeEnd) + + nodesLeft = True + while nodesLeft and self.nodeBuf.hasData(): + while self.nodeBuf.peek_u8() == 0: + debug_print("Skipping 0 node ID") + self.nodeBuf.get_u8() + + nodeType = self.nodeBuf.get_u8() + isArray = nodeType & 64 + nodeType &= ~64 + + nodeFormat = xml_formats.get(nodeType, {'name':'Unknown'}) + debug_print('Node type is {} ({})'.format(nodeFormat['name'], nodeType)) + + # node name + name = '' + if nodeType != xml_types['nodeEnd'] and nodeType != xml_types['endSection']: + strLen = self.nodeBuf.get_u8() + name = self.unpack_bits(strLen) + debug_print(name) + + skip = True + + if nodeType == xml_types['attr']: + value = self.data_grab_string() + node.setAttribute(name, value) + elif nodeType == xml_types['nodeEnd']: + if node.parentNode: + node = node.parentNode + elif nodeType == xml_types['endSection']: + nodesLeft = False + elif nodeType not in xml_formats: + raise NotImplementedError('Implement node {}'.format(nodeType)) + else: # inner value to process + skip = False + + if skip: + continue + + child = self.xml_doc.createElement(name) + node.appendChild(child) + node = child + + if nodeType == xml_types['nodeStart']: + continue + + node.setAttribute('__type', nodeFormat['name']) + + if isArray: + arrayCount = self.dataBuf.get_u32() / calcsize(nodeFormat['type']) + node.setAttribute('__count', str(arrayCount)) + else: + arrayCount = 1 + varCount = nodeFormat['count'] + if varCount == -1: + varCount = self.dataBuf.get_u32() + totalCount = arrayCount * varCount + + delim = nodeFormat.get('delimiter', ' ') + + if isArray or nodeFormat['count'] == -1: + data = self.dataBuf.get(nodeFormat['type'], totalCount) + self.dataBuf.offset += 3 # padding + self.dataBuf.offset &= ~0b11 # align to dword + else: + data = self.data_grab_aligned(nodeFormat['type'], totalCount) + string = delim.join(map(str, data)) + + if nodeType == xml_types['binary']: + node.setAttribute('__size', str(totalCount)) + string = ''.join(('{0:02x}'.format(ord(x)) for x in string)) + if nodeType == xml_types['string']: + string = string[:-1].decode('shift_jisx0213') + + node.appendChild(self.xml_doc.createTextNode(string)) + + #print self.xml_doc.toprettyxml(indent=" ", encoding='UTF-8') if __name__ == '__main__': - #input = open('./dump/_core_model=KFC_J_A_A_2016121200_module=package_method=list_out.raw','rb').read() - #input = open('./dump/KFCmodelKFCJAA2016121200modulegame3methodcommon.raw','rb').read() - input = open('test.raw', 'rb').read() - xml = binary_to_xml(input) - binary = xml_to_binary(xml) - with open('out.raw', 'wb') as f: - f.write(binary) + if len(sys.argv) < 2: + print 'bin_xml.py file1 [file2 ...]' - #print [ord(x) for x in input] - #print [ord(x) for x in binary] - #print binary_to_xml_text(input) - print binary_to_xml_text(binary) + # by default, confirm the implementation is correct + for f in sys.argv[1:]: + with open(f, 'rb') as f: + input = f.read() + xml = kbinxml(input) + print xml.to_text() + try: + # just politely ignore the signature since we don't do encoding yet + assert xml.to_binary()[4:] == input[4:] + except AssertionError: + print 'Files do not match!' + with open('out.raw', 'wb') as f: + f.write(xml.to_binary()) diff --git a/bytebuffer.py b/bytebuffer.py index 386c11c..9f1a9f0 100644 --- a/bytebuffer.py +++ b/bytebuffer.py @@ -2,11 +2,20 @@ from struct import * class ByteBuffer(): def __init__(self, input = b'', offset = 0, endian = '>'): - self.data = input + if isinstance(input, bytearray): + self.data = input + else: + self.data = bytearray(input) self.endian = endian self.offset = offset self.end = len(self.data) + def _format_type(self, type, count): + if count is None: + return self.endian + type + else: + return self.endian + str(count) + type + def get(self, type, count = None): ret = self.peek(type, count) size = calcsize(type) @@ -16,19 +25,25 @@ class ByteBuffer(): return ret def peek(self, type, count = None): - if count is None: - fmt = self.endian + type - else: - fmt = self.endian + str(count) + type + fmt = self._format_type(type, count) ret = unpack(fmt, self.data[self.offset:self.offset+calcsize(fmt)]) return ret[0] if count is None else ret - def append(self, data, type, count = 1): - if count is None: - fmt = self.endian + type + def append(self, data, type, count = None): + fmt = self._format_type(type, count) + self.offset += calcsize(fmt) + if isinstance(data, list): + self.data.extend(pack(fmt, *data)) else: - fmt = self.endian + str(count) + type - self.data += pack(fmt, data) + self.data.extend(pack(fmt, data)) + + def set(self, data, offset, type, count = None): + fmt = self._format_type(type, count) + if isinstance(data, list): + pack_into(fmt, self.data, offset, *data) + else: + pack_into(fmt, self.data, offset, data) + self.offset += calcsize(fmt) def hasData(self): return self.offset < self.end @@ -62,10 +77,17 @@ def _make_append(fmt): return self.append(data, fmt) return _method +def _make_set(fmt): + def _method(self, data, offset): + return self.set(data, offset, fmt) + return _method + for name, fmt in typeMap.iteritems(): _get = _make_get(fmt) _peek = _make_peek(fmt) _append = _make_append(fmt) + _set = _make_set(fmt) setattr(ByteBuffer, 'get_' + name, _get) setattr(ByteBuffer, 'peek_' + name, _peek) setattr(ByteBuffer, 'append_' + name, _append) + setattr(ByteBuffer, 'set_' + name, _set) diff --git a/format_ids.py b/format_ids.py index 4c14806..b3b2559 100644 --- a/format_ids.py +++ b/format_ids.py @@ -1,6 +1,6 @@ def jisString(string): - return string.encode('shift_jisx0213') + return string.encode('shift_jisx0213') + '\0' xml_formats = { 1 : { 'type' : None, 'count' : None, 'pType' : None, 'names' : ['void']},