# python 3 style, ints instead of b''
from builtins import bytes
from struct import calcsize
import string
import sys
import operator
from io import BytesIO

import lxml.etree as etree

from bytebuffer import ByteBuffer
from sixbit import pack_sixbit, unpack_sixbit
from format_ids import xml_formats, xml_types

# Binary-capable stdout: the 'buffer' attribute when present, else stdout itself.
stdout = getattr(sys.stdout, 'buffer', sys.stdout)

DEBUG_OFFSETS = False
DEBUG = False

# First byte of every binary document.
SIGNATURE = 0xA0

# Second byte: selects how node/attribute names are stored (see from_binary).
SIG_COMPRESSED = 0x42
SIG_UNCOMPRESSED = 0x45

XML_ENCODING = 'UTF-8'
BIN_ENCODING = 'SHIFT_JISX0213'

# Flag bit OR-ed into a node type byte to mark array data.
ARRAY_FLAG = 64

# Attributes that only carry conversion metadata and are never written out
# as binary attributes.
META_ATTRS = ('__type', '__size', '__count')

# NOTE: all of these are their python codec names
encoding_strings = {
    0x20: 'ASCII',
    0x00: 'ISO-8859-1',
    0x60: 'EUC_JP',
    0x80: 'SHIFT_JISX0213',
    0xA0: 'UTF_8'
}

# Reverse lookup: codec name -> encoding byte.
encoding_vals = {val: key for key, val in encoding_strings.items()}


def debug_print(string):
    """Print *string* only when the module-level DEBUG flag is set."""
    if DEBUG:
        print(string)


class KBinException(ValueError, AssertionError):
    """Raised when binary input fails a header sanity check.

    It also derives from AssertionError because these checks used to be
    plain ``assert`` statements; callers catching AssertionError keep
    working.
    """


class KBinXML():
    """Convert between a binary XML encoding and an lxml element tree.

    The parsed document is kept in ``self.xml_doc`` (the root element).
    """

    def __init__(self, input):
        """Build a document from *input*.

        *input* may be an lxml element, an lxml element tree, binary XML
        data (detected with is_binary_xml) or, failing all of those, XML
        text as bytes.
        """
        if isinstance(input, etree._Element):
            self.xml_doc = input
        elif isinstance(input, etree._ElementTree):
            self.xml_doc = input.getroot()
        elif KBinXML.is_binary_xml(input):
            self.from_binary(input)
        else:
            self.from_text(input)

    def to_text(self):
        """Return the document as a pretty-printed unicode XML string."""
        # tostring() yields bytes when given an encoding; decode so the
        # caller gets text back.
        return etree.tostring(self.xml_doc, pretty_print=True,
                              encoding=XML_ENCODING,
                              xml_declaration=True).decode(XML_ENCODING)

    def from_text(self, input):
        """Parse XML text (bytes) and store its root in ``self.xml_doc``."""
        self.xml_doc = etree.parse(BytesIO(input)).getroot()

    @staticmethod
    def is_binary_xml(input):
        """Return True when *input* starts with the two-byte binary header.

        The first byte must equal SIGNATURE and the second must be one of
        the two compression markers.
        """
        nodeBuf = ByteBuffer(input)
        return (nodeBuf.get_u8() == SIGNATURE and
                nodeBuf.get_u8() in (SIG_COMPRESSED, SIG_UNCOMPRESSED))

    def data_grab_auto(self):
        """Read an s32 length followed by that many bytes, then realign."""
        size = self.dataBuf.get_s32()
        ret = self.dataBuf.get('B', size)
        self.dataBuf.realign_reads()
        return ret

    def data_append_auto(self, data):
        """Write *data* prefixed with its s32 length, then realign."""
        self.dataBuf.append_s32(len(data))
        self.dataBuf.append(data, 'B', len(data))
        self.dataBuf.realign_writes()

    def data_grab_string(self):
        """Read a length-prefixed string, dropping its final byte.

        The final byte is the terminator that data_append_string writes.
        """
        data = self.data_grab_auto()
        return bytes(data[:-1]).decode(self.encoding)

    def data_append_string(self, string):
        """Encode *string*, append a NUL byte and write it length-prefixed."""
        string = bytes(string.encode(self.encoding) + b'\0')
        self.data_append_auto(string)

    def _sync_sub_buffers(self):
        """Move the byte/word cursors to dataBuf's offset when they are on a
        4-byte boundary (i.e. their current dword is used up or untouched)."""
        if self.dataByteBuf.offset % 4 == 0:
            self.dataByteBuf.offset = self.dataBuf.offset
        if self.dataWordBuf.offset % 4 == 0:
            self.dataWordBuf.offset = self.dataBuf.offset

    # has its own separate state and other assorted garbage
    def data_grab_aligned(self, type, count):
        """Read *count* values of struct format *type* from the data section.

        Values whose total size is 1 or 2 bytes are read through dedicated
        byte/word cursors; anything else is read through the main cursor,
        which is then realigned.
        """
        self._sync_sub_buffers()
        # multiply by count since 2u2 reads from the 16 bit buffer, for example
        size = calcsize(type) * count
        if size == 1:
            ret = self.dataByteBuf.get(type, count)
        elif size == 2:
            ret = self.dataWordBuf.get(type, count)
        else:
            ret = self.dataBuf.get(type, count)
            self.dataBuf.realign_reads()
        # The main cursor must never lag behind the byte/word cursors.
        trailing = max(self.dataByteBuf.offset, self.dataWordBuf.offset)
        if self.dataBuf.offset < trailing:
            self.dataBuf.offset = trailing
            self.dataBuf.realign_reads()
        return ret

    def data_append_aligned(self, data, type, count):
        """Write *count* values of struct format *type* to the data section.

        Mirror image of data_grab_aligned: 1- and 2-byte payloads are placed
        through the byte/word cursors, everything else is appended to the
        main buffer, which is then realigned.
        """
        self._sync_sub_buffers()
        # multiply by count since 2u2 reads from the 16 bit buffer, for example
        size = calcsize(type) * count
        if size == 1:
            # make room for our stuff if fresh dword
            if self.dataByteBuf.offset % 4 == 0:
                self.dataBuf.append_u32(0)
            self.dataByteBuf.set(data, self.dataByteBuf.offset, type, count)
        elif size == 2:
            if self.dataWordBuf.offset % 4 == 0:
                self.dataBuf.append_u32(0)
            self.dataWordBuf.set(data, self.dataWordBuf.offset, type, count)
        else:
            self.dataBuf.append(data, type, count)
            self.dataBuf.realign_writes()

    def _node_to_binary(self, node):
        """Serialise *node*, its attributes and its children recursively.

        Raises ValueError when a ``__count`` attribute disagrees with the
        number of values found in the node text.
        """
        nodeType = node.attrib.get('__type')
        if not nodeType:
            # typeless tags with text become string
            if node.text is not None and len(node.text.strip()) > 0:
                nodeType = 'str'
            else:
                nodeType = 'void'
        nodeId = xml_types[nodeType]

        isArray = 0
        count = node.attrib.get('__count')
        if count:
            count = int(count)
            isArray = ARRAY_FLAG

        self.nodeBuf.append_u8(nodeId | isArray)

        name = node.tag
        pack_sixbit(name, self.nodeBuf)

        if nodeType != 'void':
            fmt = xml_formats[nodeId]

            val = node.text
            if fmt['name'] == 'bin':
                data = bytes(bytearray.fromhex(val))
            elif fmt['name'] == 'str':
                # An explicitly typed but empty element has text None.
                if val is None:
                    val = ''
                data = bytes(val.encode(self.encoding) + b'\0')
            else:
                val = val.split(' ')
                data = list(map(fmt.get('fromStr', int), val))
                # Compare as an integer product so the check does not depend
                # on true division being in effect.
                if count and len(data) != count * fmt['count']:
                    raise ValueError('Array length does not match __count attribute')

            if isArray or fmt['count'] == -1:
                # Variable-length payload: byte size first, then the values.
                self.dataBuf.append_u32(len(data) * calcsize(fmt['type']))
                self.dataBuf.append(data, fmt['type'], len(data))
                self.dataBuf.realign_writes()
            else:
                self.data_append_aligned(data, fmt['type'], fmt['count'])

        # for test consistency and to be more faithful, sort the attrs
        sorted_attrs = sorted(node.attrib.items(), key=operator.itemgetter(0))
        for key, value in sorted_attrs:
            if key in META_ATTRS:
                continue
            self.data_append_string(value)
            self.nodeBuf.append_u8(xml_types['attr'])
            pack_sixbit(key, self.nodeBuf)

        for child in node.iterchildren(tag=etree.Element):
            self._node_to_binary(child)

        # always has the isArray bit set
        self.nodeBuf.append_u8(xml_types['nodeEnd'] | ARRAY_FLAG)

    def to_binary(self):
        """Return the document serialised as binary XML bytes.

        Output layout: signature, compression marker, encoding byte and its
        bitwise inverse, u32 node section length, node section, u32 data
        section length, data section. Names are always written packed
        (SIG_COMPRESSED) and strings use BIN_ENCODING.
        """
        self.encoding = BIN_ENCODING

        header = ByteBuffer()
        header.append_u8(SIGNATURE)
        header.append_u8(SIG_COMPRESSED)
        header.append_u8(encoding_vals[self.encoding])
        # Python's ints are big, so can't just bitwise invert
        header.append_u8(0xFF ^ encoding_vals[self.encoding])

        self.nodeBuf = ByteBuffer()
        self.dataBuf = ByteBuffer()
        # The byte/word cursors are built over dataBuf's own data object.
        self.dataByteBuf = ByteBuffer(self.dataBuf.data)
        self.dataWordBuf = ByteBuffer(self.dataBuf.data)

        self._node_to_binary(self.xml_doc)

        # always has the isArray bit set
        self.nodeBuf.append_u8(xml_types['endSection'] | ARRAY_FLAG)
        self.nodeBuf.realign_writes()
        header.append_u32(len(self.nodeBuf))
        # The data section length is tacked onto the end of the node buffer
        # (after the node length was recorded) so it lands right before the
        # data section in the output.
        self.nodeBuf.append_u32(len(self.dataBuf))
        return bytes(header.data + self.nodeBuf.data + self.dataBuf.data)

    def _read_header(self, input):
        """Validate the header of *input* and set up the read buffers.

        Sets ``compressed``, ``encoding``, ``nodeBuf``, ``dataBuf``,
        ``dataByteBuf`` and ``dataWordBuf``. Raises KBinException when the
        signature, compression marker or encoding check byte is wrong.
        """
        self.nodeBuf = ByteBuffer(input)

        # Each byte is read unconditionally and checked afterwards: doing the
        # read inside an assert would skip it (and desync the cursor) when
        # Python runs with -O.
        signature = self.nodeBuf.get_u8()
        if signature != SIGNATURE:
            raise KBinException('Bad signature byte: {}'.format(signature))

        compress = self.nodeBuf.get_u8()
        if compress not in (SIG_COMPRESSED, SIG_UNCOMPRESSED):
            raise KBinException('Bad compression byte: {}'.format(compress))
        self.compressed = compress == SIG_COMPRESSED

        encoding_key = self.nodeBuf.get_u8()
        encoding_check = self.nodeBuf.get_u8()
        if encoding_check != 0xFF ^ encoding_key:
            raise KBinException('Encoding byte does not match its inverse')
        self.encoding = encoding_strings[encoding_key]

        # The node section follows the 8 header bytes read so far
        # (4 marker bytes + this u32 length).
        nodeEnd = self.nodeBuf.get_u32() + 8
        self.nodeBuf.end = nodeEnd

        self.dataBuf = ByteBuffer(input, nodeEnd)
        # Consume the u32 data section size; the value itself is not used.
        self.dataBuf.get_u32()
        # This is all no fun
        self.dataByteBuf = ByteBuffer(input, nodeEnd)
        self.dataWordBuf = ByteBuffer(input, nodeEnd)

    def from_binary(self, input):
        """Parse binary XML *input* into ``self.xml_doc``.

        Raises KBinException on a malformed header and NotImplementedError
        for a node type that has no entry in xml_formats.
        """
        # Temporary parent so the real root can be attached like any child.
        self.xml_doc = etree.Element('root')
        node = self.xml_doc

        self._read_header(input)

        nameless = (xml_types['nodeEnd'], xml_types['endSection'])

        while self.nodeBuf.hasData():
            while self.nodeBuf.peek_u8() == 0:
                debug_print("Skipping 0 node ID")
                self.nodeBuf.get_u8()

            nodeType = self.nodeBuf.get_u8()
            isArray = nodeType & ARRAY_FLAG
            nodeType &= ~ARRAY_FLAG

            nodeFormat = xml_formats.get(nodeType, {'name': 'Unknown'})
            debug_print('Node type is {} ({})'.format(nodeFormat['name'], nodeType))

            # node or attribute name
            name = ''
            if nodeType not in nameless:
                if self.compressed:
                    name = unpack_sixbit(self.nodeBuf)
                else:
                    # NOTE(review): name is whatever ByteBuffer.get returns
                    # for format 's' -- confirm it is usable as a tag name.
                    length = self.nodeBuf.get_u8()
                    name = self.nodeBuf.get('s', length)
                debug_print(name)

            if nodeType == xml_types['attr']:
                node.attrib[name] = self.data_grab_string()
                continue
            if nodeType == xml_types['nodeEnd']:
                parent = node.getparent()
                if parent is not None:
                    node = parent
                continue
            if nodeType == xml_types['endSection']:
                break
            if nodeType not in xml_formats:
                raise NotImplementedError('Implement node {}'.format(nodeType))

            # Anything left opens a new element, possibly with a value.
            node = etree.SubElement(node, name)

            if nodeType == xml_types['nodeStart']:
                continue

            node.attrib['__type'] = nodeFormat['name']

            varCount = nodeFormat['count']
            arrayCount = 1
            if varCount == -1:  # the 2 cannot be combined
                varCount = self.dataBuf.get_u32()
                isArray = True
            elif isArray:
                # Stored value is the byte size; divide by the element size.
                elementSize = calcsize(nodeFormat['type']) * varCount
                arrayCount = self.dataBuf.get_u32() // elementSize
                node.attrib['__count'] = str(arrayCount)
            totalCount = arrayCount * varCount

            if isArray:
                data = self.dataBuf.get(nodeFormat['type'], totalCount)
                self.dataBuf.realign_reads()
            else:
                data = self.data_grab_aligned(nodeFormat['type'], totalCount)

            if nodeType == xml_types['binary']:
                node.attrib['__size'] = str(totalCount)
                text = ''.join(('{0:02x}'.format(x) for x in data))
            elif nodeType == xml_types['string']:
                # Drop the final (terminator) byte before decoding.
                text = bytes(data[:-1]).decode(self.encoding)
            else:
                text = ' '.join(map(nodeFormat.get('toStr', str), data))

            node.text = text

        # because we need the 'real' root
        self.xml_doc = self.xml_doc[0]


def main():
    """Convert the file named on the command line and write it to stdout.

    Binary input is written out as XML text; anything else is written out
    as binary XML.
    """
    if len(sys.argv) != 2:
        print('bin_xml.py file.[xml/bin]')
        sys.exit()

    with open(sys.argv[1], 'rb') as f:
        data = f.read()

    xml = KBinXML(data)
    if KBinXML.is_binary_xml(data):
        stdout.write(xml.to_text().encode('utf-8'))
    else:
        stdout.write(xml.to_binary())


if __name__ == '__main__':
    main()