commit d1e9ab72bfef9c4b3c5cdc7578fec8cfe0453ff4
Author: William Toohey <will@mon.im>
Date:   Tue Jun 13 19:57:04 2017 +1000

    Initial commit

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/bin_xml.py b/bin_xml.py
new file mode 100644
index 0000000..555d28a
--- /dev/null
+++ b/bin_xml.py
@@ -0,0 +1,325 @@
+from xml.dom import minidom
+from struct import calcsize
+import string
+from bitarray import bitarray
+from bytebuffer import ByteBuffer
+from format_ids import xml_formats, xml_types
+
+import IPython
+
+DEBUG_OFFSETS = False  # not referenced elsewhere in this module
+DEBUG = False  # when true, debug_print() writes its argument to stdout
+
+SIGNATURE = 0xA042  # u16 that starts every blob this module reads or writes
+
+encodings = [  # indexed by the top three bits of the third header byte
+    None,
+    'ASCII',
+    'ISO-8859-1',
+    'EUC-JP',
+    'SHIFT_JIS',  # index 4, the value xml_to_binary() writes (4 << 5)
+    'UTF-8'
+]
+
+def debug_print(string):  # print `string`, but only while the module-level DEBUG flag is set
+    if DEBUG:
+        print string
+
+def pack_bits(string, nodeBuf, bits = 6):
+    """Append `string` to nodeBuf as consecutive `bits`-wide codes, zero-padded to a whole byte."""
+    packed = bitarray(endian='big') # named apart from the `bits` width argument
+    for c in str_to_sixbit(string):
+        packed.frombytes(c)
+        del packed[-8:-bits] # keep only the low `bits` bits of the byte just added
+    for c in packed.tobytes():
+        nodeBuf.append_u8(ord(c))
+
+def unpack_bits(bitArray, byteBuf, length, bits = 6):
+    """Decode `length` codes of `bits` bits each, starting at bit byteBuf.offset * 8
+    of bitArray, and return them as a string via sixbit_to_str(). byteBuf.offset
+    is then moved past the run, rounded up to a whole byte.
+    """
+    start = byteBuf.offset * 8
+    codes = [ord(bitArray[start + i * bits:start + (i + 1) * bits].tobytes()) >> (8 - bits) for i in range(length)]
+    byteBuf.offset += (length * bits + 7) // 8
+    return sixbit_to_str(codes)
+
+# Codes: 0-9 digits, 10 ':', 11-36 capitals, 37 underscore, 38-63 lowercase
+def sixbit_to_str(decompressed):
+    """Map a sequence of 6-bit codes to the characters they stand for."""
+    def to_char(code):
+        if code <= 10:
+            shift = ord('0')  # '0'..'9', and code 10 lands on ':'
+        elif code < 37:
+            shift = 54        # 11 -> 'A' ... 36 -> 'Z'
+        elif code == 37:
+            shift = 58        # '_'
+        else:
+            shift = 59        # 38 -> 'a' ... 63 -> 'z'
+        return chr(code + shift)
+    return ''.join(to_char(code) for code in decompressed)
+
+def str_to_sixbit(string):
+    """Translate a name into 6-bit codes, returned as a string of chr(code).
+    '0'-'9' -> 0-9, ':' -> 10, 'A'-'Z' -> 11-36, '_' -> 37, 'a'-'z' -> 38-63.
+    ':' is accepted because sixbit_to_str() emits it for code 10; any other
+    character raises ValueError.
+    """
+    spans = (('0', ':', 48), ('A', 'Z', 54), ('_', '_', 58), ('a', 'z', 59))
+    codes = []
+    for c in string:
+        bases = [base for first, last, base in spans if first <= c <= last]
+        if not bases:
+            raise ValueError('Node name can only contain alphanumeric + underscore')
+        codes.append(chr(ord(c) - bases[0]))
+    return ''.join(codes)
+
+def data_grab_auto(dataBuf):
+    """Read an s32 count, then that many u8 values, and return them as a list.
+
+    The read offset is afterwards rounded up to the next multiple of 4.
+    """
+    values = [dataBuf.get_u8() for _ in range(dataBuf.get_s32())]
+    dataBuf.offset = (dataBuf.offset + 3) & ~0b11
+    return values
+
+def data_append_auto(dataBuf, data):
+    """Append len(data) as an s32, then data itself, then zero-pad dataBuf to a multiple of 4."""
+    size = len(data)
+    dataBuf.append_s32(size)
+    dataBuf.append(data, 's', size)
+    for _ in range(-len(dataBuf) % 4):
+        dataBuf.append_u8(0)
+
+def data_append_string(dataBuf, string):
+    """Encode `string` as shift_jisx0213 and append it via data_append_auto()."""
+    data_append_auto(dataBuf, string.encode('shift_jisx0213'))
+
+def data_grab_string(dataBuf):
+    """Read a blob with data_grab_auto() and decode it as shift_jisx0213 text.
+
+    Everything from the first 0 byte onwards is discarded before decoding.
+    """
+    raw = data_grab_auto(dataBuf)
+    text = raw[:raw.index(0)] if 0 in raw else raw
+    return ''.join(map(chr, text)).decode('shift_jisx0213')
+
+# Read `count` values of struct code `type`; 1- and 2-byte reads use their own cursors, so consecutive small reads continue inside one dword
+def data_grab_aligned(dataBuf, dataByteBuf, dataWordBuf, type, count):
+    if dataByteBuf.offset % 4 == 0:  # byte cursor sits on a dword boundary: restart it at the main cursor
+        dataByteBuf.offset = dataBuf.offset
+    if dataWordBuf.offset % 4 == 0:  # same for the 16-bit cursor
+        dataWordBuf.offset = dataBuf.offset
+    # the total size picks the cursor, e.g. 2u8 (two bytes in all) is read through the 16-bit one
+    size = calcsize(type) * count
+    if size == 1:
+        ret = dataByteBuf.get(type, count)
+    elif size == 2:
+        ret = dataWordBuf.get(type, count)
+    else:
+        ret = dataBuf.get(type, count)  # NOTE(review): a 3-byte read appears to leave dataBuf unaligned — confirm this is intended
+    trailing = max(dataByteBuf.offset, dataWordBuf.offset)
+    if dataBuf.offset < trailing:  # a small cursor ran ahead: move the main cursor to the dword boundary at or after it
+        dataBuf.offset = trailing + 3
+        dataBuf.offset &= ~0b11
+    return ret  # the tuple produced by ByteBuffer.get() when called with a count
+
+def is_binary_xml(input):
+    """True if `input` starts with SIGNATURE; False (not struct.error) if it is under 2 bytes."""
+    return len(input) >= 2 and ByteBuffer(input).get_u16() == SIGNATURE
+
+def _xml_node_to_binary(node, nodeBuf, dataBuf):
+    """Recursively serialise a minidom element: type ids and packed names go
+    to nodeBuf, attribute and node values to dataBuf. Raises KeyError for an
+    unknown `__type`, NotImplementedError for non-array fixed-width values.
+    """
+    nodeType = node.getAttribute('__type') or 'void'
+    nodeId = xml_types[nodeType]
+    isArray = 0
+    count = node.getAttribute('__count')
+    if count:
+        count = int(count)
+        isArray = 64 # bit position for array flag
+
+    nodeBuf.append_u8(nodeId | isArray)
+    name = node.nodeName
+    nodeBuf.append_u8(len(name))
+    pack_bits(name, nodeBuf)
+
+    # attributes go out sorted by name; the __ ones only steer this encoder
+    for key, value in sorted(node.attributes.items()):
+        if key not in ('__type', '__size', '__count'):
+            data_append_string(dataBuf, value)
+            nodeBuf.append_u8(xml_types['attr'])
+            nodeBuf.append_u8(len(key))
+            pack_bits(key, nodeBuf)
+
+    if nodeType != 'void':
+        fmt = xml_formats[nodeId]
+        delim = fmt.get('delimiter', ' ')
+        text = node.firstChild.nodeValue
+        # split('') raises ValueError, so undelimited values stay whole
+        data = map(fmt['pType'], text.split(delim) if delim else [text])
+        if fmt['count'] == -1:
+            payload = str(data[0])
+            if nodeId == xml_types['string']:
+                payload += '\0' # binary_to_xml() drops a string's last byte
+            # size prefix, then every byte, then padding to a dword
+            data_append_auto(dataBuf, payload)
+        elif isArray:
+            dataBuf.append_u32(len(data))
+            for d in data:
+                dataBuf.append(d, fmt['type'])
+            while len(dataBuf) % 4: # binary_to_xml() realigns after arrays
+                dataBuf.append_u8(0)
+        else: # would need the byte/word packing data_grab_aligned() undoes
+            raise NotImplementedError('cannot write non-array {} values yet'.format(nodeType))
+
+    for child in node.childNodes:
+        if child.nodeType == child.ELEMENT_NODE:
+            _xml_node_to_binary(child, nodeBuf, dataBuf)
+
+    nodeBuf.append_u8(xml_types['nodeEnd'] | 64)
+
+def xml_text_to_binary(input):  # parse XML text with minidom, then encode the resulting document
+    return xml_to_binary(minidom.parseString(input))
+
+def xml_to_binary(input):
+    """Encode a minidom document: 8-byte header, node section, u32 data size, data section."""
+    nodeBuf = ByteBuffer()
+    dataBuf = ByteBuffer()
+    for child in input.childNodes:
+        _xml_node_to_binary(child, nodeBuf, dataBuf)
+    nodeBuf.append_u8(xml_types['endSection'] | 64)
+    for _ in range(-len(nodeBuf) % 4): # zero-pad the node section to a dword
+        nodeBuf.append_u8(0)
+
+    header = ByteBuffer()
+    header.append_u16(SIGNATURE)
+    header.append_u8(4 << 5) # SHIFT-JIS TODO make encoding variable
+    header.append_u8(0x7F) # TODO what does this do as 7f or ff
+    header.append_u32(len(nodeBuf)) # node section size, taken before the data size is appended
+    nodeBuf.append_u32(len(dataBuf))
+    return header.data + nodeBuf.data + dataBuf.data
+
+def binary_to_xml_text(input):  # decode, then pretty-print as UTF-8 with four-space indentation
+    return binary_to_xml(input).toprettyxml(indent="    ", encoding='UTF-8')
+
+def binary_to_xml(input):
+    """Decode a binary XML blob into a minidom Document.
+
+    Typed nodes get `__type` (and `__count` / `__size` where applicable)
+    attributes. Errors while reading the data section propagate.
+    """
+    doc = minidom.Document()
+    node = doc
+
+    nodeBuf = ByteBuffer(input)
+    assert nodeBuf.get_u16() == SIGNATURE
+    encoding = encodings[(nodeBuf.get_u8() & 0xE0) >> 5] # looked up but not used below
+    unknown = nodeBuf.get_u8()
+
+    # creating bitarrays is slow, cache for speed
+    nodeBits = bitarray(endian='big')
+    nodeBits.frombytes(input)
+
+    nodeEnd = nodeBuf.get_u32() + 8 # node section length + the 8 header bytes
+    nodeBuf.end = nodeEnd
+
+    dataBuf = ByteBuffer(input, nodeEnd)
+    dataSize = dataBuf.get_u32()
+    # extra cursors for the 1- and 2-byte reads done by data_grab_aligned()
+    dataByteBuf = ByteBuffer(input, nodeEnd)
+    dataWordBuf = ByteBuffer(input, nodeEnd)
+
+    nodesLeft = True
+    while nodesLeft and nodeBuf.hasData():
+        while nodeBuf.peek_u8() == 0:
+            debug_print("Skipping 0 node ID")
+            nodeBuf.get_u8()
+
+        nodeType = nodeBuf.get_u8()
+        isArray = nodeType & 64
+        nodeType &= ~64
+
+        nodeFormat = xml_formats.get(nodeType, {'name':'Unknown'})
+        debug_print('Node type is {} ({})'.format(nodeFormat['name'], nodeType))
+
+        # node name
+        name = ''
+        if nodeType != xml_types['nodeEnd'] and nodeType != xml_types['endSection']:
+            strLen = nodeBuf.get_u8()
+            name = unpack_bits(nodeBits, nodeBuf, strLen)
+            debug_print(name)
+
+        skip = True
+
+        if nodeType == xml_types['attr']:
+            value = data_grab_string(dataBuf)
+            node.setAttribute(name, value)
+        elif nodeType == xml_types['nodeEnd']:
+            if node.parentNode:
+                node = node.parentNode
+        elif nodeType == xml_types['endSection']:
+            nodesLeft = False
+        elif nodeType not in xml_formats:
+            raise NotImplementedError('Implement node {}'.format(nodeType))
+        else: # inner value to process
+            skip = False
+
+        if skip:
+            continue
+
+        child = doc.createElement(name)
+        node.appendChild(child)
+        node = child
+
+        if nodeType == xml_types['nodeStart']:
+            continue
+
+        node.setAttribute('__type', nodeFormat['name'])
+
+        if isArray:
+            arrayCount = dataBuf.get_u32()
+            node.setAttribute('__count', str(arrayCount))
+        else:
+            arrayCount = 1
+        varCount = nodeFormat['count']
+        if varCount == -1:
+            varCount = dataBuf.get_u32()
+        totalCount = arrayCount * varCount
+
+        delim = nodeFormat.get('delimiter', ' ')
+
+        if isArray or nodeFormat['count'] == -1:
+            data = dataBuf.get(nodeFormat['type'], totalCount)
+            dataBuf.offset += 3 # padding
+            dataBuf.offset &= ~0b11 # align to dword
+        else:
+            data = data_grab_aligned(dataBuf, dataByteBuf, dataWordBuf, nodeFormat['type'], totalCount)
+        string = delim.join(map(str, data))
+
+        if nodeType == xml_types['binary']:
+            node.setAttribute('__size', str(totalCount))
+            string = ''.join(('{0:02x}'.format(ord(x)) for x in string))
+        if nodeType == xml_types['string']:
+            string = string[:-1].decode('shift_jisx0213')
+
+        node.appendChild(doc.createTextNode(string))
+
+    return doc
+
+
+if __name__ == '__main__':
+    # Round-trip smoke test: decode test.raw, re-encode it into out.raw, then
+    # print the XML obtained by decoding that re-encoded output.
+    with open('test.raw', 'rb') as f: # closed as soon as it has been read
+        input = f.read()
+    xml = binary_to_xml(input)
+    binary = xml_to_binary(xml)
+    with open('out.raw', 'wb') as f:
+        f.write(binary)
+
+    #print [ord(x) for x in input]
+    #print [ord(x) for x in binary]
+    print binary_to_xml_text(binary)
diff --git a/bytebuffer.py b/bytebuffer.py
new file mode 100644
index 0000000..386c11c
--- /dev/null
+++ b/bytebuffer.py
@@ -0,0 +1,71 @@
+from struct import *
+
+class ByteBuffer():
+    """Cursor over a byte string: values are read with struct at `offset`
+    and appended at the end, with `endian` as the byte-order prefix.
+    """
+    def __init__(self, input = b'', offset = 0, endian = '>'):
+        self.data, self.endian, self.offset = input, endian, offset
+        self.end = len(input) # limit tested by hasData(); append() does not move it
+
+    def _fmt(self, type, count):
+        # struct format string: byte-order prefix, optional repeat count, type code
+        return self.endian + ('' if count is None else str(count)) + type
+
+    def get(self, type, count = None):
+        """Like peek(), but also advances `offset` past the bytes that were read."""
+        ret = self.peek(type, count)
+        self.offset += calcsize(type) * (1 if count is None else count)
+        return ret
+
+    def peek(self, type, count = None):
+        """Unpack at `offset` without moving it: one value, or a tuple when count is given."""
+        fmt = self._fmt(type, count)
+        ret = unpack(fmt, self.data[self.offset:self.offset + calcsize(fmt)])
+        return ret if count is not None else ret[0]
+
+    def append(self, data, type, count = 1):
+        """Pack `data` as a single struct argument and add the bytes to the end."""
+        self.data += pack(self._fmt(type, count), data)
+
+    def hasData(self):
+        """True while `offset` is still before `end`."""
+        return self.offset < self.end
+
+    def __len__(self):
+        """Number of bytes currently held."""
+        return len(self.data)
+
+typeMap = {  # accessor-name suffix -> struct format character (s = signed, u = unsigned, then bit width)
+    's8'  : 'b',
+    's16' : 'h',
+    's32' : 'i',
+    's64' : 'q',
+    'u8'  : 'B',
+    'u16' : 'H',
+    'u32' : 'I',
+    'u64' : 'Q'
+}
+
+def _make_get(fmt):
+    """Build a reader method for the struct code `fmt`."""
+    # a closure per code, so each generated method keeps its own `fmt`
+    return lambda self: self.get(fmt)
+
+def _make_peek(fmt):
+    """Build a non-advancing reader method for the struct code `fmt`."""
+    # a closure per code, so each generated method keeps its own `fmt`
+    return lambda self: self.peek(fmt)
+
+def _make_append(fmt):
+    """Build a writer method for the struct code `fmt`."""
+    # hands back whatever self.append() returns, like the call it wraps
+    return lambda self, data: self.append(data, fmt)
+
+for name, fmt in typeMap.iteritems():  # dict.iteritems() exists on Python 2 only
+    _get = _make_get(fmt)
+    _peek = _make_peek(fmt)
+    _append = _make_append(fmt)
+    setattr(ByteBuffer, 'get_' + name, _get)  # e.g. ByteBuffer.get_u16
+    setattr(ByteBuffer, 'peek_' + name, _peek)  # e.g. ByteBuffer.peek_u16
+    setattr(ByteBuffer, 'append_' + name, _append)  # e.g. ByteBuffer.append_u16
diff --git a/format_ids.py b/format_ids.py
new file mode 100644
index 0000000..4c14806
--- /dev/null
+++ b/format_ids.py
@@ -0,0 +1,74 @@
+
+def jisString(string):  # text -> shift_jisx0213 bytes; used as the 'pType' parser of the str/string format
+    return string.encode('shift_jisx0213')
+
+xml_formats = {  # node type id -> struct code ('type'), values per node ('count'), text parser ('pType'), XML type names ('names')
+    1  : { 'type' : None, 'count' : None, 'pType' : None,  'names' : ['void']},
+    2  : { 'type' : 'b',  'count' : 1,    'pType' : int,   'names' : ['s8']},
+    3  : { 'type' : 'B',  'count' : 1,    'pType' : int,   'names' : ['u8']},
+    4  : { 'type' : 'h',  'count' : 1,    'pType' : int,   'names' : ['s16']},
+    5  : { 'type' : 'H',  'count' : 1,    'pType' : int,   'names' : ['u16']},
+    6  : { 'type' : 'i',  'count' : 1,    'pType' : int,   'names' : ['s32']},
+    7  : { 'type' : 'I',  'count' : 1,    'pType' : int,   'names' : ['u32']},
+    8  : { 'type' : 'q',  'count' : 1,    'pType' : int,   'names' : ['s64']},
+    9  : { 'type' : 'Q',  'count' : 1,    'pType' : int,   'names' : ['u64']},
+    10 : { 'type' : 'c',  'count' : -1,   'pType' : bytearray.fromhex, 'names' : ['bin', 'binary'], 'delimiter' : ''}, # count -1: the length is stored as a u32 in front of the data
+    11 : { 'type' : 's',  'count' : -1,   'pType' : jisString, 'names' : ['str', 'string'], 'delimiter' : ''},
+    12 : { 'type' : 'B',  'count' : 4,    'pType' : int,   'names' : ['ip4'], 'delimiter' : '.'},
+    13 : { 'type' : 'I',  'count' : 1,    'pType' : int,   'names' : ['time']}, # todo: how to print
+    14 : { 'type' : 'f',  'count' : 1,    'pType' : float, 'names' : ['float', 'f']},
+    15 : { 'type' : 'd',  'count' : 1,    'pType' : float, 'names' : ['double', 'd']},
+    16 : { 'type' : 'b',  'count' : 2,    'pType' : int,   'names' : ['2s8']},
+    17 : { 'type' : 'B',  'count' : 2,    'pType' : int,   'names' : ['2u8']},
+    18 : { 'type' : 'h',  'count' : 2,    'pType' : int,   'names' : ['2s16']},
+    19 : { 'type' : 'H',  'count' : 2,    'pType' : int,   'names' : ['2u16']},
+    20 : { 'type' : 'i',  'count' : 2,    'pType' : int,   'names' : ['2s32']},
+    21 : { 'type' : 'I',  'count' : 2,    'pType' : int,   'names' : ['2u32']},
+    22 : { 'type' : 'q',  'count' : 2,    'pType' : int,   'names' : ['2s64', 'vs64']},
+    23 : { 'type' : 'Q',  'count' : 2,    'pType' : int,   'names' : ['2u64', 'vu64']},
+    24 : { 'type' : 'f',  'count' : 2,    'pType' : float, 'names' : ['2f']},
+    25 : { 'type' : 'd',  'count' : 2,    'pType' : float, 'names' : ['2d', 'vd']},
+    26 : { 'type' : 'b',  'count' : 3,    'pType' : int,   'names' : ['3s8']},
+    27 : { 'type' : 'B',  'count' : 3,    'pType' : int,   'names' : ['3u8']},
+    28 : { 'type' : 'h',  'count' : 3,    'pType' : int,   'names' : ['3s16']},
+    29 : { 'type' : 'H',  'count' : 3,    'pType' : int,   'names' : ['3u16']},
+    30 : { 'type' : 'i',  'count' : 3,    'pType' : int,   'names' : ['3s32']},
+    31 : { 'type' : 'I',  'count' : 3,    'pType' : int,   'names' : ['3u32']},
+    32 : { 'type' : 'q',  'count' : 3,    'pType' : int,   'names' : ['3s64']},
+    33 : { 'type' : 'Q',  'count' : 3,    'pType' : int,   'names' : ['3u64']},
+    34 : { 'type' : 'f',  'count' : 3,    'pType' : float, 'names' : ['3f']},
+    35 : { 'type' : 'd',  'count' : 3,    'pType' : float, 'names' : ['3d']},
+    36 : { 'type' : 'b',  'count' : 4,    'pType' : int,   'names' : ['4s8']},
+    37 : { 'type' : 'B',  'count' : 4,    'pType' : int,   'names' : ['4u8']},
+    38 : { 'type' : 'h',  'count' : 4,    'pType' : int,   'names' : ['4s16']},
+    39 : { 'type' : 'H',  'count' : 4,    'pType' : int,   'names' : ['4u16']},
+    40 : { 'type' : 'i',  'count' : 4,    'pType' : int,   'names' : ['4s32', 'vs32']},
+    41 : { 'type' : 'I',  'count' : 4,    'pType' : int,   'names' : ['4u32', 'vu32']},
+    42 : { 'type' : 'q',  'count' : 4,    'pType' : int,   'names' : ['4s64']},
+    43 : { 'type' : 'Q',  'count' : 4,    'pType' : int,   'names' : ['4u64']},
+    44 : { 'type' : 'f',  'count' : 4,    'pType' : float, 'names' : ['4f', 'vf']},
+    45 : { 'type' : 'd',  'count' : 4,    'pType' : float, 'names' : ['4d']},
+    46 : { 'type' : None, 'count' : None, 'pType' : None,  'names' : ['attr']},
+    #47 : { 'type' : None, 'count' : None, 'pType' : None,  'names' : ['array']},
+    48 : { 'type' : 'b',  'count' : 16,   'pType' : int,   'names' : ['vs8']},
+    49 : { 'type' : 'B',  'count' : 16,   'pType' : int,   'names' : ['vu8']},
+    50 : { 'type' : 'h',  'count' : 8,    'pType' : int,   'names' : ['vs16']},
+    51 : { 'type' : 'H',  'count' : 8,    'pType' : int,   'names' : ['vu16']},
+    52 : { 'type' : 'b',  'count' : 1,    'pType' : int,   'names' : ['bool', 'b']},
+    53 : { 'type' : 'b',  'count' : 2,    'pType' : int,   'names' : ['2b']},
+    54 : { 'type' : 'b',  'count' : 3,    'pType' : int,   'names' : ['3b']},
+    55 : { 'type' : 'b',  'count' : 4,    'pType' : int,   'names' : ['4b']},
+    56 : { 'type' : 'b',  'count' : 16,   'pType' : int,   'names' : ['vb']}
+}
+
+# every format's canonical 'name' is the first of its aliases
+for key, val in xml_formats.iteritems():
+    val['name'] = val['names'][0]
+
+# reverse lookup: any alias -> numeric node type id
+xml_types = {}
+for key, val in xml_formats.iteritems():
+    xml_types.update((n, key) for n in val['names'])
+xml_types['nodeStart'] = 1 # same id as 'void'
+xml_types['nodeEnd'] = 190
+xml_types['endSection'] = 191