# kbinxml/bin_xml.py

from xml.dom import minidom
from struct import calcsize
import string
from bitarray import bitarray
from bytebuffer import ByteBuffer
from format_ids import xml_formats, xml_types
import sys

# Debug switch; not referenced anywhere in the visible part of this file.
DEBUG_OFFSETS = False
# When true, debug_print() writes its argument to stdout.
DEBUG = False

# Magic number expected in the first 16-bit word of a binary XML buffer.
SIGNATURE = 0xA042

# Text encodings, indexed by the top three bits of the third header byte
# (see from_binary). to_binary currently always writes index 4.
encodings = [
    None,
    'ASCII',
    'ISO-8859-1',
    'EUC-JP',
    'SHIFT_JIS',
    'UTF-8'
]

def debug_print(string):
    """Write ``string`` to stdout when the module-level DEBUG flag is set.

    Does nothing while DEBUG is false.
    """
    if DEBUG:
        # The parenthesised single-argument form means the same thing to the
        # Python 2 print statement and the Python 3 print function.
        print(string)

class kbinxml():

    def __init__(self, input):
        if isinstance(input, minidom.Document):
            self.xml_doc = input
        elif self.is_binary_xml(input):
            self.from_binary(input)
        else:
            self.from_text(input)

    def pack_bits(self, string, bits = 6):
        """Append ``string`` to the node buffer as packed ``bits``-wide codes.

        Every character is first converted to its code by ``str_to_sixbit``.
        Only the low ``bits`` bits of each code are kept and the codes are
        written back to back, most significant bit first; the result is
        appended to ``self.nodeBuf`` one byte at a time.

        Raises whatever ``str_to_sixbit`` raises for unsupported characters.
        """
        codes = self.str_to_sixbit(string)
        # Kept under its own name so the ``bits`` width argument is not
        # overwritten by the bit container.
        packed = bitarray(endian='big')
        for c in codes:
            packed.frombytes(c)
            # drop the unused high bits of the byte that was just appended
            del packed[-8:-bits]
        for c in packed.tobytes():
            self.nodeBuf.append_u8(ord(c))

    def unpack_bits(self, length, bits = 6):
        """Read ``length`` packed codes of ``bits`` bits and decode them.

        Reading starts at the bit position matching the node buffer's current
        byte offset inside ``self.nodeBits``. The node buffer offset is then
        moved past the packed codes, rounded up to a whole byte, and the
        codes are turned into text by ``sixbit_to_str``.
        """
        start = self.nodeBuf.offset * 8
        codes = []
        for index in range(length):
            low = start + index * bits
            chunk = self.nodeBits[low:low + bits].tobytes()
            # the slice is left-aligned in its byte; shift it down
            codes.append(ord(chunk) >> (8 - bits))
        # skip the bytes consumed, including the partially used last one
        self.nodeBuf.offset += (length * bits + 7) // 8
        return self.sixbit_to_str(codes)

    # 0-9 for numbers, 10 for colon, 11 to 36 for capitals, 37 for underscore, 38-63 for lowercase
    def sixbit_to_str(self, decompressed):
        """Turn an iterable of six-bit codes into the text they stand for.

        Codes up to 10 land on the ASCII run '0'..':', 11-36 on 'A'-'Z',
        37 on '_' and anything above on the lowercase letters.
        """
        def to_char(code):
            if code <= 10:
                return chr(code + ord('0'))
            if code < 37:
                return chr(code + 54)
            if code == 37:
                return chr(code + 58)
            return chr(code + 59)

        return ''.join(to_char(code) for code in decompressed)

    def str_to_sixbit(self, string):
        """Map every character of ``string`` to its six-bit code.

        The alphabet mirrors ``sixbit_to_str``: digits are 0-9, ':' is 10,
        'A'-'Z' are 11-36, '_' is 37 and 'a'-'z' are 38-63. Accepting ':'
        keeps the two directions symmetric, so any name the decoder can
        produce can be encoded again.

        Returns a string with one character per input character, whose
        ordinal is the code.

        Raises:
            ValueError: ``string`` contains a character outside the alphabet.
        """
        charset = ('0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                   '_abcdefghijklmnopqrstuvwxyz')
        compress = []
        for c in string:
            # position in the alphabet is the code; -1 means unsupported
            code = charset.find(c)
            if code < 0:
                raise ValueError('Node name can only contain alphanumeric + underscore + colon')
            compress.append(code)
        return ''.join(map(chr, compress))

    def data_grab_auto(self):
        """Read a length-prefixed run of bytes from the data buffer.

        A signed 32-bit length is read first, followed by that many values
        fetched with ``get_u8``. The buffer offset is then rounded up to a
        multiple of four. Returns the fetched values as a list.
        """
        buf = self.dataBuf
        length = buf.get_s32()
        payload = []
        for _ in range(length):
            payload.append(buf.get_u8())
        # step over the padding: round the offset up to the next dword
        buf.offset = (buf.offset + 3) & ~0b11
        return payload

    def data_append_auto(self, data):
        """Write ``data`` to the data buffer behind a 32-bit length prefix.

        After the payload, zero bytes are appended until the buffer length
        is a multiple of four.
        """
        buf = self.dataBuf
        size = len(data)
        buf.append_s32(size)
        buf.append(data, 's', size)

        # zero-fill up to the next dword boundary
        while len(buf) % 4 != 0:
            buf.append_u8(0)

    def data_append_string(self, string):
        """Store ``string`` in the data buffer as NUL-terminated text.

        The text is encoded as shift_jisx0213, a trailing NUL is added and
        the result is written with ``data_append_auto``.
        """
        encoded = string.encode('shift_jisx0213')
        self.data_append_auto(encoded + '\0')

    def data_grab_string(self):
        """Read a length-prefixed string from the data buffer.

        The bytes come from ``data_grab_auto``; everything from the first
        zero byte onwards is discarded and the rest is decoded as
        shift_jisx0213.
        """
        raw = self.data_grab_auto()
        chars = []
        for value in raw:
            if value == 0:
                # NUL terminator: the text ends here
                break
            chars.append(chr(value))
        return ''.join(chars).decode('shift_jisx0213')

    # has its own separate state and other assorted garbage
    def data_grab_aligned(self, type, count):
        """Read ``count`` values of struct format ``type`` from the data section.

        Values whose total size is one byte are read through the byte
        cursor, two-byte values through the word cursor and everything else
        through the main data cursor. A byte/word cursor that sits on a
        four-byte boundary is first moved to the main cursor's position.
        If the main cursor ends up behind either of them, it is advanced to
        the next four-byte boundary past the furthest one.
        """
        main_buf = self.dataBuf
        byte_buf = self.dataByteBuf
        word_buf = self.dataWordBuf
        for packed_buf in (byte_buf, word_buf):
            if packed_buf.offset % 4 == 0:
                packed_buf.offset = main_buf.offset
        # total size, not element size: e.g. two 1-byte values together
        # are read through the 16-bit cursor
        width = calcsize(type) * count
        if width == 1:
            source = byte_buf
        elif width == 2:
            source = word_buf
        else:
            source = main_buf
        ret = source.get(type, count)
        furthest = max(byte_buf.offset, word_buf.offset)
        if main_buf.offset < furthest:
            # round up to the dword following the packed values
            main_buf.offset = (furthest + 3) & ~0b11
        return ret

    def data_append_aligned(self, data, type, count):
        """Write ``count`` values of struct format ``type`` to the data section.

        Values whose total size is one byte are written through the byte
        cursor, two-byte values through the word cursor, and anything else
        is appended to the main data buffer. A byte/word cursor that sits on
        a four-byte boundary is first moved to the main buffer's offset, and
        a zeroed 32-bit word is appended to the main buffer to make room
        before the value is stored at the cursor.
        """
        main_buf = self.dataBuf
        byte_buf = self.dataByteBuf
        word_buf = self.dataWordBuf
        for packed_buf in (byte_buf, word_buf):
            if packed_buf.offset % 4 == 0:
                packed_buf.offset = main_buf.offset
        # total size, not element size: e.g. two 1-byte values together
        # go through the 16-bit cursor
        width = calcsize(type) * count
        if width == 1:
            target = byte_buf
        elif width == 2:
            target = word_buf
        else:
            main_buf.append(data, type, count)
            return
        # fresh dword needed: reserve it in the main buffer first
        if target.offset % 4 == 0:
            main_buf.append_u32(0)
        target.set(data, target.offset, type, count)

    def is_binary_xml(self, input):
        """Return True when ``input`` starts with the binary XML signature."""
        leading = ByteBuffer(input).get_u16()
        return leading == SIGNATURE

    def _node_to_binary(self, node):
        """Serialise ``node`` and its element descendants into the buffers.

        For an element this writes, in order: the type id (with the array
        flag when a ``__count`` attribute is present), the packed node name,
        every attribute other than ``__type``/``__size``/``__count`` sorted
        by name (value to the data buffer, marker and packed name to the node
        buffer), the node's own value when its type is not ``void``, all child
        elements recursively, and finally the node-end marker.

        Nodes that are not elements (text, comments, processing
        instructions, doctypes, ...) are ignored.

        Raises:
            KeyError: the ``__type`` attribute names an unknown type.
            ValueError: ``__count`` is not an integer, or a name contains a
                character that cannot be packed.
        """
        # Only elements carry attributes and can be encoded; anything else
        # would fail on getAttribute below.
        if node.nodeType != node.ELEMENT_NODE:
            return

        nodeType = node.getAttribute('__type')
        if not nodeType:
            nodeType = 'void'
        nodeId = xml_types[nodeType]

        isArray = 0
        count = node.getAttribute('__count')
        if count:
            # Only validated here; the stored array length is derived from
            # the data itself further down.
            count = int(count)
            isArray = 64 # bit position for array flag

        self.nodeBuf.append_u8(nodeId | isArray)

        name = node.nodeName
        self.nodeBuf.append_u8(len(name))
        self.pack_bits(name)

        import operator
        sorted_x = sorted(node.attributes.items(), key=operator.itemgetter(0))
        for key, value in sorted_x:
            # bookkeeping attributes describe the value, they are not data
            if key in ('__type', '__size', '__count'):
                continue
            self.data_append_string(value)
            self.nodeBuf.append_u8(xml_types['attr'])
            self.nodeBuf.append_u8(len(key))
            self.pack_bits(key)

        if nodeType != 'void':
            fmt = xml_formats[nodeId]

            # An element with an empty value has no text child at all once
            # the XML has been through a parser.
            first = node.firstChild
            val = first.nodeValue if first is not None else ''
            if fmt['count'] != -1:
                val = val.split(fmt.get('delimiter', ' '))
                # a real list: its length is needed below
                data = [fmt['pType'](v) for v in val]
            else:
                data = fmt['pType'](val)

            if isArray or fmt['count'] == -1:
                # variable-length payload: byte length, data, dword padding
                self.dataBuf.append_u32(len(data) * calcsize(fmt['type']))
                self.dataBuf.append(data, fmt['type'], len(data))
                # padding
                while len(self.dataBuf) % 4:
                    self.dataBuf.append_u8(0)
            else:
                self.data_append_aligned(data, fmt['type'], fmt['count'])

        for child in node.childNodes:
            if child.nodeType == child.ELEMENT_NODE:
                self._node_to_binary(child)

        self.nodeBuf.append_u8(xml_types['nodeEnd'] | 64)

    def from_text(self, input):
        self.xml_doc = minidom.parseString(input)

    def to_binary(self):
        """Encode ``self.xml_doc`` and return the binary XML bytes.

        The output is the header (signature, encoding byte, one further
        fixed byte, node section length), the node section padded to a
        multiple of four bytes, the data section length and the data
        section. Calling this replaces the instance's node/data buffers.
        """
        header = ByteBuffer()
        header.append_u16(SIGNATURE)
        header.append_u8(4 << 5) # SHIFT-JIS TODO make encoding variable
        header.append_u8(0x7F) # TODO what does this do as 7f or ff

        # fresh working buffers; the byte/word cursors are built on the data
        # buffer's storage
        self.nodeBuf = ByteBuffer()
        self.dataBuf = ByteBuffer()
        self.dataByteBuf = ByteBuffer(self.dataBuf.data)
        self.dataWordBuf = ByteBuffer(self.dataBuf.data)

        for top_level in self.xml_doc.childNodes:
            self._node_to_binary(top_level)

        nodes = self.nodeBuf
        nodes.append_u8(xml_types['endSection'] | 64)
        while len(nodes) % 4 != 0:
            nodes.append_u8(0)
        # the node length goes into the header before the data length is
        # tacked onto the end of the node section
        header.append_u32(len(nodes))
        nodes.append_u32(len(self.dataBuf))
        return bytes(header.data + nodes.data + self.dataBuf.data)

    def to_text(self):
        return self.xml_doc.toprettyxml(indent="    ", encoding='UTF-8')

    def from_binary(self, input):
        """Decode a binary XML buffer into ``self.xml_doc``.

        The header (signature, encoding byte, one further byte, node section
        length) is read first. The node section is then walked entry by
        entry: attribute entries set an attribute on the current element,
        node-end entries step back to the parent, the end-of-section entry
        stops the walk, and every other known type creates a child element.
        Typed elements get a ``__type`` attribute, arrays a ``__count``
        attribute (number of array elements), binary values a ``__size``
        attribute, and their value is stored as a text child.

        Raises:
            AssertionError: ``input`` does not start with SIGNATURE.
            NotImplementedError: a node type id is not in ``xml_formats``.
        """
        self.xml_doc = minidom.Document()
        node = self.xml_doc

        self.nodeBuf = ByteBuffer(input)
        # Read the signature unconditionally: written inside an ``assert``
        # the read itself disappears under ``python -O`` and every later
        # field is parsed two bytes early. AssertionError is kept as the
        # exception type so existing callers see no difference.
        signature = self.nodeBuf.get_u16()
        if signature != SIGNATURE:
            raise AssertionError('Input does not start with the binary XML signature')
        # Top three bits select the text encoding. Parsed but not used yet:
        # strings are always decoded as shift_jisx0213.
        encoding = encodings[(self.nodeBuf.get_u8() & 0xE0) >> 5]
        # meaning unknown; read to stay aligned
        unknown = self.nodeBuf.get_u8()

        # creating bitarrays is slow, cache for speed
        self.nodeBits = bitarray(endian='big')
        self.nodeBits.frombytes(input)

        # node section length plus the 8 header bytes
        nodeEnd = self.nodeBuf.get_u32() + 8
        self.nodeBuf.end = nodeEnd

        self.dataBuf = ByteBuffer(input, nodeEnd)
        # data section length; read to move past it, value not needed
        dataSize = self.dataBuf.get_u32()
        # separate cursors for 1-byte and 2-byte values, see data_grab_aligned
        self.dataByteBuf = ByteBuffer(input, nodeEnd)
        self.dataWordBuf = ByteBuffer(input, nodeEnd)

        nodesLeft = True
        while nodesLeft and self.nodeBuf.hasData():
            while self.nodeBuf.peek_u8() == 0:
                debug_print("Skipping 0 node ID")
                self.nodeBuf.get_u8()

            nodeType = self.nodeBuf.get_u8()
            isArray = nodeType & 64
            nodeType &= ~64

            nodeFormat = xml_formats.get(nodeType, {'name':'Unknown'})
            debug_print('Node type is {} ({})'.format(nodeFormat['name'], nodeType))

            # node name
            name = ''
            if nodeType != xml_types['nodeEnd'] and nodeType != xml_types['endSection']:
                strLen = self.nodeBuf.get_u8()
                name = self.unpack_bits(strLen)
                debug_print(name)

            # entries that do not open a new element
            if nodeType == xml_types['attr']:
                value = self.data_grab_string()
                node.setAttribute(name, value)
                continue
            if nodeType == xml_types['nodeEnd']:
                if node.parentNode:
                    node = node.parentNode
                continue
            if nodeType == xml_types['endSection']:
                nodesLeft = False
                continue
            if nodeType not in xml_formats:
                raise NotImplementedError('Implement node {}'.format(nodeType))

            child = self.xml_doc.createElement(name)
            node.appendChild(child)
            node = child

            if nodeType == xml_types['nodeStart']:
                continue

            node.setAttribute('__type', nodeFormat['name'])

            varCount = nodeFormat['count']
            arrayCount = 1
            if isArray:
                arrayBytes = self.dataBuf.get_u32()
                # One array element is ``count`` values of the scalar format.
                # Dividing by the scalar size alone over-counts multi-value
                # types, which are multiplied by ``count`` again below.
                elementSize = calcsize(nodeFormat['type'])
                if varCount > 0:
                    elementSize *= varCount
                # floor division keeps this an int on Python 3 as well
                arrayCount = arrayBytes // elementSize
                node.setAttribute('__count', str(arrayCount))
            if varCount == -1:
                # variable-length value: the length is stored in the data
                varCount = self.dataBuf.get_u32()
            totalCount = arrayCount * varCount

            delim = nodeFormat.get('delimiter', ' ')

            if isArray or nodeFormat['count'] == -1:
                data = self.dataBuf.get(nodeFormat['type'], totalCount)
                self.dataBuf.offset += 3 # padding
                self.dataBuf.offset &= ~0b11 # align to dword
            else:
                data = self.data_grab_aligned(nodeFormat['type'], totalCount)
            string = delim.join(map(str, data))

            if nodeType == xml_types['binary']:
                node.setAttribute('__size', str(totalCount))
                # binary payloads are shown as lowercase hex
                string = ''.join(('{0:02x}'.format(ord(x)) for x in string))
            if nodeType == xml_types['string']:
                # drop the last character (the terminator) before decoding
                string = string[:-1].decode('shift_jisx0213')

            node.appendChild(self.xml_doc.createTextNode(string))

            #print self.xml_doc.toprettyxml(indent="  ", encoding='UTF-8')

# Command-line self-check: decode each file given on the command line, print
# it as XML, re-encode it and compare the result with the original bytes.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('bin_xml.py file1 [file2 ...]')

    # by default, confirm the implementation is correct
    for path in sys.argv[1:]:
        with open(path, 'rb') as f:
            original = f.read()
        xml = kbinxml(original)
        print(xml.to_text())
        # encode once and reuse the result for both the check and the dump
        binary = xml.to_binary()
        # just politely ignore the signature since we don't do encoding yet
        # (a plain comparison rather than assert, which -O would strip)
        if binary[4:] != original[4:]:
            print('Files do not match!')
            with open('out.raw', 'wb') as f:
                f.write(binary)