diff --git a/README.md b/README.md index 1f72fca..a250bd5 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,12 @@ An encoder/decoder for Konami's binary XML format, used in some of their games. +### Setup: +`pip install bitarray` + +Python 2 only: +`pip install future` + ```python In [1]: from kbinxml import KBinXML In [2]: text = KBinXML('Hello, world!') diff --git a/bytebuffer.py b/bytebuffer.py index dbbfe95..4695eca 100644 --- a/bytebuffer.py +++ b/bytebuffer.py @@ -7,6 +7,8 @@ class ByteBuffer(): if isinstance(input, bytearray): self.data = input else: + if not isinstance(input, bytes): + input = input.encode('utf-8') self.data = bytearray(input) self.endian = endian self.offset = offset @@ -34,14 +36,14 @@ class ByteBuffer(): def append(self, data, type, count = None): fmt = self._format_type(type, count) self.offset += calcsize(fmt) - if isinstance(data, list) or isinstance(data, bytes) and type != 's': + if count and count > 1 or isinstance(data, list): self.data.extend(pack(fmt, *data)) else: self.data.extend(pack(fmt, data)) def set(self, data, offset, type, count = None): fmt = self._format_type(type, count) - if isinstance(data, list) or isinstance(data, bytes) and type != 's': + if count and count > 1 or isinstance(data, list): pack_into(fmt, self.data, offset, *data) else: pack_into(fmt, self.data, offset, data) @@ -92,7 +94,7 @@ def _make_set(fmt): return self.set(data, offset, fmt) return _method -for name, fmt in typeMap.iteritems(): +for name, fmt in typeMap.items(): _get = _make_get(fmt) _peek = _make_peek(fmt) _append = _make_append(fmt) diff --git a/format_ids.py b/format_ids.py index 4767fa5..9c501b7 100644 --- a/format_ids.py +++ b/format_ids.py @@ -1,68 +1,87 @@ +from struct import pack, unpack + +def parseIP(string): + bunch = map(int, string.split('.')) + # pack to bytes + p = pack('4B', *bunch) + # unpack as u32 + return unpack('>I', p)[0] + +def writeIP(raw): + # pack to bytes + p = pack('>I', raw) + # unpack + return '.'.join(map(str, 
unpack('4B', p))) + +def writeFloat(raw): + # this is just how floats get printed... + return '{0:.6f}'.format(raw) + xml_formats = { - 1 : { 'type' : None, 'count' : None, 'pyType' : None, 'names' : ['void']}, - 2 : { 'type' : 'b', 'count' : 1, 'pyType' : int, 'names' : ['s8']}, - 3 : { 'type' : 'B', 'count' : 1, 'pyType' : int, 'names' : ['u8']}, - 4 : { 'type' : 'h', 'count' : 1, 'pyType' : int, 'names' : ['s16']}, - 5 : { 'type' : 'H', 'count' : 1, 'pyType' : int, 'names' : ['u16']}, - 6 : { 'type' : 'i', 'count' : 1, 'pyType' : int, 'names' : ['s32']}, - 7 : { 'type' : 'I', 'count' : 1, 'pyType' : int, 'names' : ['u32']}, - 8 : { 'type' : 'q', 'count' : 1, 'pyType' : int, 'names' : ['s64']}, - 9 : { 'type' : 'Q', 'count' : 1, 'pyType' : int, 'names' : ['u64']}, - 10 : { 'type' : 'c', 'count' : -1, 'pyType' : None, 'names' : ['bin', 'binary'], 'delimiter' : ''}, - 11 : { 'type' : 's', 'count' : -1, 'pyType' : None, 'names' : ['str', 'string'], 'delimiter' : ''}, - 12 : { 'type' : 'B', 'count' : 4, 'pyType' : int, 'names' : ['ip4'], 'delimiter' : '.'}, - 13 : { 'type' : 'I', 'count' : 1, 'pyType' : int, 'names' : ['time']}, # todo: how to print - 14 : { 'type' : 'f', 'count' : 1, 'pyType' : float, 'names' : ['float', 'f']}, - 15 : { 'type' : 'd', 'count' : 1, 'pyType' : float, 'names' : ['double', 'd']}, - 16 : { 'type' : 'b', 'count' : 2, 'pyType' : int, 'names' : ['2s8']}, - 17 : { 'type' : 'B', 'count' : 2, 'pyType' : int, 'names' : ['2u8']}, - 18 : { 'type' : 'h', 'count' : 2, 'pyType' : int, 'names' : ['2s16']}, - 19 : { 'type' : 'H', 'count' : 2, 'pyType' : int, 'names' : ['2u16']}, - 20 : { 'type' : 'i', 'count' : 2, 'pyType' : int, 'names' : ['2s32']}, - 21 : { 'type' : 'I', 'count' : 2, 'pyType' : int, 'names' : ['2u32']}, - 22 : { 'type' : 'q', 'count' : 2, 'pyType' : int, 'names' : ['2s64', 'vs64']}, - 23 : { 'type' : 'Q', 'count' : 2, 'pyType' : int, 'names' : ['2u64', 'vu64']}, - 24 : { 'type' : 'f', 'count' : 2, 'pyType' : float, 'names' : ['2f']}, 
- 25 : { 'type' : 'd', 'count' : 2, 'pyType' : float, 'names' : ['2d', 'vd']}, - 26 : { 'type' : 'b', 'count' : 3, 'pyType' : int, 'names' : ['3s8']}, - 27 : { 'type' : 'B', 'count' : 3, 'pyType' : int, 'names' : ['3u8']}, - 28 : { 'type' : 'h', 'count' : 3, 'pyType' : int, 'names' : ['3s16']}, - 29 : { 'type' : 'H', 'count' : 3, 'pyType' : int, 'names' : ['3u16']}, - 30 : { 'type' : 'i', 'count' : 3, 'pyType' : int, 'names' : ['3s32']}, - 31 : { 'type' : 'I', 'count' : 3, 'pyType' : int, 'names' : ['3u32']}, - 32 : { 'type' : 'q', 'count' : 3, 'pyType' : int, 'names' : ['3s64']}, - 33 : { 'type' : 'Q', 'count' : 3, 'pyType' : int, 'names' : ['3u64']}, - 34 : { 'type' : 'f', 'count' : 3, 'pyType' : float, 'names' : ['3f']}, - 35 : { 'type' : 'd', 'count' : 3, 'pyType' : float, 'names' : ['3d']}, - 36 : { 'type' : 'b', 'count' : 4, 'pyType' : int, 'names' : ['4s8']}, - 37 : { 'type' : 'B', 'count' : 4, 'pyType' : int, 'names' : ['4u8']}, - 38 : { 'type' : 'h', 'count' : 4, 'pyType' : int, 'names' : ['4s16']}, - 39 : { 'type' : 'H', 'count' : 4, 'pyType' : int, 'names' : ['4u16']}, - 40 : { 'type' : 'i', 'count' : 4, 'pyType' : int, 'names' : ['4s32', 'vs32']}, - 41 : { 'type' : 'I', 'count' : 4, 'pyType' : int, 'names' : ['4u32', 'vu32']}, - 42 : { 'type' : 'q', 'count' : 4, 'pyType' : int, 'names' : ['4s64']}, - 43 : { 'type' : 'Q', 'count' : 4, 'pyType' : int, 'names' : ['4u64']}, - 44 : { 'type' : 'f', 'count' : 4, 'pyType' : float, 'names' : ['4f', 'vf']}, - 45 : { 'type' : 'd', 'count' : 4, 'pyType' : float, 'names' : ['4d']}, - 46 : { 'type' : None, 'count' : None, 'pyType' : None, 'names' : ['attr']}, - #47 : { 'type' : None, 'count' : None, 'pyType' : None, 'names' : ['array']}, - 48 : { 'type' : 'b', 'count' : 16, 'pyType' : int, 'names' : ['vs8']}, - 49 : { 'type' : 'B', 'count' : 16, 'pyType' : int, 'names' : ['vu8']}, - 50 : { 'type' : 'h', 'count' : 8, 'pyType' : int, 'names' : ['vs16']}, - 51 : { 'type' : 'H', 'count' : 8, 'pyType' : int, 'names' : 
['vu16']}, - 52 : { 'type' : 'b', 'count' : 1, 'pyType' : int, 'names' : ['bool', 'b']}, - 53 : { 'type' : 'b', 'count' : 2, 'pyType' : int, 'names' : ['2b']}, - 54 : { 'type' : 'b', 'count' : 3, 'pyType' : int, 'names' : ['3b']}, - 55 : { 'type' : 'b', 'count' : 4, 'pyType' : int, 'names' : ['4b']}, - 56 : { 'type' : 'b', 'count' : 16, 'pyType' : int, 'names' : ['vb']} + 1 : { 'names' : ['void']}, + 2 : { 'type' : 'b', 'count' : 1, 'names' : ['s8']}, + 3 : { 'type' : 'B', 'count' : 1, 'names' : ['u8']}, + 4 : { 'type' : 'h', 'count' : 1, 'names' : ['s16']}, + 5 : { 'type' : 'H', 'count' : 1, 'names' : ['u16']}, + 6 : { 'type' : 'i', 'count' : 1, 'names' : ['s32']}, + 7 : { 'type' : 'I', 'count' : 1, 'names' : ['u32']}, + 8 : { 'type' : 'q', 'count' : 1, 'names' : ['s64']}, + 9 : { 'type' : 'Q', 'count' : 1, 'names' : ['u64']}, + 10 : { 'type' : 'B', 'count' : -1, 'names' : ['bin', 'binary'], 'fromStr' : None}, + 11 : { 'type' : 'B', 'count' : -1, 'names' : ['str', 'string'], 'fromStr' : None}, + 12 : { 'type' : 'I', 'count' : 1, 'names' : ['ip4'], 'fromStr' : parseIP, 'toStr' : writeIP}, + 13 : { 'type' : 'I', 'count' : 1, 'names' : ['time']}, # unix timestamp + 14 : { 'type' : 'f', 'count' : 1, 'names' : ['float', 'f'], 'fromStr' : float, 'toStr' : writeFloat}, + 15 : { 'type' : 'd', 'count' : 1, 'names' : ['double', 'd'], 'fromStr' : float, 'toStr' : writeFloat}, + 16 : { 'type' : 'b', 'count' : 2, 'names' : ['2s8']}, + 17 : { 'type' : 'B', 'count' : 2, 'names' : ['2u8']}, + 18 : { 'type' : 'h', 'count' : 2, 'names' : ['2s16']}, + 19 : { 'type' : 'H', 'count' : 2, 'names' : ['2u16']}, + 20 : { 'type' : 'i', 'count' : 2, 'names' : ['2s32']}, + 21 : { 'type' : 'I', 'count' : 2, 'names' : ['2u32']}, + 22 : { 'type' : 'q', 'count' : 2, 'names' : ['2s64', 'vs64']}, + 23 : { 'type' : 'Q', 'count' : 2, 'names' : ['2u64', 'vu64']}, + 24 : { 'type' : 'f', 'count' : 2, 'names' : ['2f'], 'fromStr' : float, 'toStr' : writeFloat}, + 25 : { 'type' : 'd', 'count' : 2, 'names' 
: ['2d', 'vd'], 'fromStr' : float, 'toStr' : writeFloat}, + 26 : { 'type' : 'b', 'count' : 3, 'names' : ['3s8']}, + 27 : { 'type' : 'B', 'count' : 3, 'names' : ['3u8']}, + 28 : { 'type' : 'h', 'count' : 3, 'names' : ['3s16']}, + 29 : { 'type' : 'H', 'count' : 3, 'names' : ['3u16']}, + 30 : { 'type' : 'i', 'count' : 3, 'names' : ['3s32']}, + 31 : { 'type' : 'I', 'count' : 3, 'names' : ['3u32']}, + 32 : { 'type' : 'q', 'count' : 3, 'names' : ['3s64']}, + 33 : { 'type' : 'Q', 'count' : 3, 'names' : ['3u64']}, + 34 : { 'type' : 'f', 'count' : 3, 'names' : ['3f'], 'fromStr' : float, 'toStr' : writeFloat}, + 35 : { 'type' : 'd', 'count' : 3, 'names' : ['3d'], 'fromStr' : float, 'toStr' : writeFloat}, + 36 : { 'type' : 'b', 'count' : 4, 'names' : ['4s8']}, + 37 : { 'type' : 'B', 'count' : 4, 'names' : ['4u8']}, + 38 : { 'type' : 'h', 'count' : 4, 'names' : ['4s16']}, + 39 : { 'type' : 'H', 'count' : 4, 'names' : ['4u16']}, + 40 : { 'type' : 'i', 'count' : 4, 'names' : ['4s32', 'vs32']}, + 41 : { 'type' : 'I', 'count' : 4, 'names' : ['4u32', 'vu32']}, + 42 : { 'type' : 'q', 'count' : 4, 'names' : ['4s64']}, + 43 : { 'type' : 'Q', 'count' : 4, 'names' : ['4u64']}, + 44 : { 'type' : 'f', 'count' : 4, 'names' : ['4f', 'vf'], 'fromStr' : float, 'toStr' : writeFloat}, + 45 : { 'type' : 'd', 'count' : 4, 'names' : ['4d'], 'fromStr' : float, 'toStr' : writeFloat}, + 46 : { 'names' : ['attr']}, + #47 : { 'names' : ['array']}, # TODO: how does this work? 
+ 48 : { 'type' : 'b', 'count' : 16, 'names' : ['vs8']}, + 49 : { 'type' : 'B', 'count' : 16, 'names' : ['vu8']}, + 50 : { 'type' : 'h', 'count' : 8, 'names' : ['vs16']}, + 51 : { 'type' : 'H', 'count' : 8, 'names' : ['vu16']}, + 52 : { 'type' : 'b', 'count' : 1, 'names' : ['bool', 'b']}, + 53 : { 'type' : 'b', 'count' : 2, 'names' : ['2b']}, + 54 : { 'type' : 'b', 'count' : 3, 'names' : ['3b']}, + 55 : { 'type' : 'b', 'count' : 4, 'names' : ['4b']}, + 56 : { 'type' : 'b', 'count' : 16, 'names' : ['vb']} } # little less boilerplate for writing -for key, val in xml_formats.iteritems(): +for key, val in xml_formats.items(): xml_formats[key]['name'] = xml_formats[key]['names'][0] xml_types = {} -for key, val in xml_formats.iteritems(): +for key, val in xml_formats.items(): for n in val['names']: xml_types[n] = key xml_types['nodeStart'] = 1 diff --git a/kbinxml.py b/kbinxml.py index 6947775..5b63429 100644 --- a/kbinxml.py +++ b/kbinxml.py @@ -1,3 +1,5 @@ +# python 3 style, ints instead of b'' +from builtins import bytes from xml.dom import minidom from struct import calcsize import string @@ -8,6 +10,8 @@ from bytebuffer import ByteBuffer from sixbit import pack_sixbit, unpack_sixbit from format_ids import xml_formats, xml_types +stdout = getattr(sys.stdout, 'buffer', sys.stdout) + DEBUG_OFFSETS = False DEBUG = False @@ -32,7 +36,7 @@ encoding_vals = {val : key for key, val in encoding_strings.items()} def debug_print(string): if DEBUG: - print string + print(string) class KBinXML(): @@ -69,15 +73,10 @@ class KBinXML(): def data_grab_string(self): data = self.data_grab_auto() - res = '' - for b in data: - if b == 0: - break - res += chr(b) - return res.decode(self.encoding) + return bytes(data[:-1]).decode(self.encoding) def data_append_string(self, string): - string = string.encode(self.encoding) + '\0' + string = bytes(string.encode(self.encoding) + b'\0') self.data_append_auto(string) # has its own separate state and other assorted garbage @@ -122,6 +121,8 @@ 
class KBinXML(): self.dataBuf.realign_writes() def _node_to_binary(self, node): + if node.nodeType == node.TEXT_NODE or node.nodeType == node.COMMENT_NODE: + return nodeType = node.getAttribute('__type') if not nodeType: nodeType = 'void' @@ -145,10 +146,12 @@ class KBinXML(): if fmt['name'] == 'bin': data = bytes(bytearray.fromhex(val)) elif fmt['name'] == 'str': - data = val.encode(self.encoding) + '\0' + data = bytes(val.encode(self.encoding) + b'\0') else: - val = val.split(fmt.get('delimiter', ' ')) - data = map(fmt['pyType'], val) + val = val.split(' ') + data = list(map(fmt.get('fromStr', int), val)) + if count and len(data) / fmt['count'] != count: + raise ValueError('Array length does not match __count attribute') if isArray or fmt['count'] == -1: self.dataBuf.append_u32(len(data) * calcsize(fmt['type'])) @@ -157,7 +160,7 @@ class KBinXML(): else: self.data_append_aligned(data, fmt['type'], fmt['count']) - # for consistency and to be more faithful + # for test consistency and to be more faithful, sort the attrs sorted_attrs = sorted(node.attributes.items(), key=operator.itemgetter(0)) for key, value in sorted_attrs: if key not in ['__type', '__size', '__count']: @@ -166,8 +169,7 @@ class KBinXML(): pack_sixbit(key, self.nodeBuf) for child in node.childNodes: - if child.nodeType != child.TEXT_NODE: - self._node_to_binary(child) + self._node_to_binary(child) # always has the isArray bit set self.nodeBuf.append_u8(xml_types['nodeEnd'] | 64) @@ -270,42 +272,42 @@ class KBinXML(): node.setAttribute('__type', nodeFormat['name']) - if isArray: - arrayCount = self.dataBuf.get_u32() / calcsize(nodeFormat['type']) - node.setAttribute('__count', str(arrayCount)) - else: - arrayCount = 1 varCount = nodeFormat['count'] - if varCount == -1: + arrayCount = 1 + if varCount == -1: # the 2 cannot be combined varCount = self.dataBuf.get_u32() + isArray = True + elif isArray: + arrayCount = self.dataBuf.get_u32() // (calcsize(nodeFormat['type'] * varCount)) + 
node.setAttribute('__count', str(arrayCount)) totalCount = arrayCount * varCount - delim = nodeFormat.get('delimiter', ' ') - - if isArray or nodeFormat['count'] == -1: + if isArray: data = self.dataBuf.get(nodeFormat['type'], totalCount) self.dataBuf.realign_reads() else: data = self.data_grab_aligned(nodeFormat['type'], totalCount) - string = delim.join(map(str, data)) if nodeType == xml_types['binary']: node.setAttribute('__size', str(totalCount)) - string = ''.join(('{0:02x}'.format(ord(x)) for x in string)) + string = ''.join(('{0:02x}'.format(x) for x in data)) elif nodeType == xml_types['string']: - string = string[:-1].decode(self.encoding) + string = bytes(data[:-1]).decode(self.encoding) + else: + string = ' '.join(map(nodeFormat.get('toStr', str), data)) node.appendChild(self.xml_doc.createTextNode(string)) if __name__ == '__main__': if len(sys.argv) != 2: - print 'bin_xml.py file.[xml/bin]' + print('bin_xml.py file.[xml/bin]') + exit() - with open(sys.argv[1:], 'rb') as f: + with open(sys.argv[1], 'rb') as f: input = f.read() xml = KBinXML(input) if KBinXML.is_binary_xml(input): - print xml.to_text() + stdout.write(xml.to_text()) else: - print xml.to_binary() + stdout.write(xml.to_binary()) diff --git a/sixbit.py b/sixbit.py index bc45a89..8bd22b4 100644 --- a/sixbit.py +++ b/sixbit.py @@ -1,27 +1,28 @@ +# python 3 style, ints instead of b'' +from builtins import bytes from bitarray import bitarray def pack_sixbit(string, byteBuf): chars = str_to_sixbit(string) bits = bitarray(endian='big') for c in chars: - bits.frombytes(c) + bits.frombytes(c.encode()) # leave only the 6 bits we care for del bits[-8:-6] - data = bits.tobytes() + data = bytes(bits.tobytes()) byteBuf.append_u8(len(string)) - byteBuf.append(data, 'c', len(data)) + byteBuf.append(data, 'B', len(data)) def unpack_sixbit(byteBuf): - bitBuf = bitarray(endian='big') - bitBuf.frombytes(bytes(byteBuf.data)) length = byteBuf.get_u8() + length_bytes = (length * 6 + 7) // 8 + bitBuf = 
bitarray(endian='big') + bitBuf.frombytes(bytes(byteBuf.get('B', length_bytes))) result = [] - offset = byteBuf.offset * 8 + offset = 0 for i in range(length): result.append(ord(bitBuf[offset:offset+6].tobytes()) >> (8 - 6)) offset += 6 - # padding - byteBuf.offset += (length * 6 + 7) // 8 return sixbit_to_str(result) # 0-9 for numbers, 10 is ':', 11 to 36 for capitals, 37 for underscore, 38-63 for lowercase