# Mirror of https://github.com/mon/kbinxml.git
# Synced 2026-03-21 18:04:52 -05:00
# 322 lines, 11 KiB, Python
# python 3 style, ints instead of b''
|
|
from builtins import bytes
|
|
from struct import calcsize
|
|
import string
|
|
import sys
|
|
import operator
|
|
from io import BytesIO
|
|
|
|
import lxml.etree as etree
|
|
from bytebuffer import ByteBuffer
|
|
from sixbit import pack_sixbit, unpack_sixbit
|
|
from format_ids import xml_formats, xml_types
|
|
|
|
# Use the underlying binary stream when sys.stdout exposes one via `buffer`;
# otherwise fall back to sys.stdout itself.
stdout = getattr(sys.stdout, 'buffer', sys.stdout)

# Debug switches; debug_print() only prints while DEBUG is true.
DEBUG_OFFSETS = False
DEBUG = False

# First byte of a binary document (checked by KBinXML.is_binary_xml).
SIGNATURE = 0xA0

# Second byte of a binary document: selects how node names are stored.
SIG_COMPRESSED = 0x42
SIG_UNCOMPRESSED = 0x45

# Encoding used for text XML output, and encoding written by to_binary().
XML_ENCODING = 'UTF-8'
BIN_ENCODING = 'SHIFT_JISX0213'

# Header encoding byte -> codec name.
# NOTE: all of these are their python codec names
encoding_strings = {
    0x20: 'ASCII',
    0x00: 'ISO-8859-1',
    0x60: 'EUC_JP',
    0x80: 'SHIFT_JISX0213',
    0xA0: 'UTF_8',
}

# Reverse lookup: codec name -> header encoding byte.
encoding_vals = dict(
    (codec, flag) for flag, codec in encoding_strings.items()
)
|
|
|
|
def debug_print(string):
    """Print *string* when the module-level DEBUG flag is set; else do nothing."""
    if not DEBUG:
        return
    print(string)
|
|
|
|
class KBinXML():
    """Convert a document between text XML and the binary XML encoding.

    The document is held as an lxml element in ``self.xml_doc``.  While
    encoding or decoding, the instance also carries working buffers:

    * ``nodeBuf``     -- node section (type bytes and names)
    * ``dataBuf``     -- data section (values and attribute strings)
    * ``dataByteBuf`` -- cursor into the data section for 1-byte values
    * ``dataWordBuf`` -- cursor into the data section for 2-byte values
    """

    def __init__(self, input):
        """Wrap *input*, converting it to an element tree if necessary.

        *input* may be an lxml element, an lxml element tree, binary XML
        data (recognised by its two signature bytes), or text XML bytes.
        """
        if isinstance(input, etree._Element):
            self.xml_doc = input
        elif isinstance(input, etree._ElementTree):
            self.xml_doc = input.getroot()
        elif KBinXML.is_binary_xml(input):
            self.from_binary(input)
        else:
            self.from_text(input)

    def to_text(self):
        """Return the document as a pretty-printed XML string with declaration."""
        # tostring() yields encoded bytes; decode again to hand back text.
        encoded = etree.tostring(
            self.xml_doc,
            pretty_print=True,
            encoding=XML_ENCODING,
            xml_declaration=True,
        )
        return encoded.decode(XML_ENCODING)

    def from_text(self, input):
        """Parse text XML bytes and store the root element in ``xml_doc``."""
        self.xml_doc = etree.parse(BytesIO(input)).getroot()

    @staticmethod
    def is_binary_xml(input):
        """Return True when *input* starts with the two binary signature bytes."""
        nodeBuf = ByteBuffer(input)
        return (nodeBuf.get_u8() == SIGNATURE and
                nodeBuf.get_u8() in (SIG_COMPRESSED, SIG_UNCOMPRESSED))

    def data_grab_auto(self):
        """Read a signed 32-bit length, then that many bytes, from ``dataBuf``.

        The read position is realigned afterwards.
        """
        size = self.dataBuf.get_s32()
        ret = self.dataBuf.get('B', size)
        self.dataBuf.realign_reads()
        return ret

    def data_append_auto(self, data):
        """Append *data* to ``dataBuf`` prefixed with its length, then realign."""
        self.dataBuf.append_s32(len(data))
        self.dataBuf.append(data, 'B', len(data))
        self.dataBuf.realign_writes()

    def data_grab_string(self):
        """Read a length-prefixed string and decode it with ``self.encoding``.

        The final byte (the terminator written by data_append_string) is
        dropped before decoding.
        """
        data = self.data_grab_auto()
        return bytes(data[:-1]).decode(self.encoding)

    def data_append_string(self, string):
        """Encode *string*, add a NUL terminator and append it length-prefixed."""
        string = bytes(string.encode(self.encoding) + b'\0')
        self.data_append_auto(string)

    # has its own separate state and other assorted garbage
    def data_grab_aligned(self, type, count):
        """Read *count* values of struct format *type* from the data section.

        Values whose total size is 1 or 2 bytes are read through the
        dedicated byte/word cursors; everything else is read from
        ``dataBuf`` directly.  Returns whatever the buffer's ``get`` returns.
        """
        # A cursor sitting on a 4-byte boundary is re-synchronised to the
        # current position of the main data cursor.
        if self.dataByteBuf.offset % 4 == 0:
            self.dataByteBuf.offset = self.dataBuf.offset
        if self.dataWordBuf.offset % 4 == 0:
            self.dataWordBuf.offset = self.dataBuf.offset
        # multiply by count since 2u2 reads from the 16 bit buffer, for example
        size = calcsize(type) * count
        if size == 1:
            ret = self.dataByteBuf.get(type, count)
        elif size == 2:
            ret = self.dataWordBuf.get(type, count)
        else:
            ret = self.dataBuf.get(type, count)
            self.dataBuf.realign_reads()
        # The main cursor must never lag behind what the small cursors have
        # already consumed.
        trailing = max(self.dataByteBuf.offset, self.dataWordBuf.offset)
        if self.dataBuf.offset < trailing:
            self.dataBuf.offset = trailing
            self.dataBuf.realign_reads()
        return ret

    def data_append_aligned(self, data, type, count):
        """Write *count* values of struct format *type* to the data section.

        Mirrors data_grab_aligned: 1- and 2-byte totals are written through
        the byte/word cursors, larger values are appended to ``dataBuf``.
        """
        if self.dataByteBuf.offset % 4 == 0:
            self.dataByteBuf.offset = self.dataBuf.offset
        if self.dataWordBuf.offset % 4 == 0:
            self.dataWordBuf.offset = self.dataBuf.offset
        # multiply by count since 2u2 reads from the 16 bit buffer, for example
        size = calcsize(type) * count
        if size == 1:
            # make room for our stuff if fresh dword
            if self.dataByteBuf.offset % 4 == 0:
                self.dataBuf.append_u32(0)
            self.dataByteBuf.set(data, self.dataByteBuf.offset, type, count)
        elif size == 2:
            if self.dataWordBuf.offset % 4 == 0:
                self.dataBuf.append_u32(0)
            self.dataWordBuf.set(data, self.dataWordBuf.offset, type, count)
        else:
            self.dataBuf.append(data, type, count)
            self.dataBuf.realign_writes()

    def _node_to_binary(self, node):
        """Serialise *node*, its attributes and its children recursively.

        Raises ValueError when a ``__count`` attribute disagrees with the
        number of values found in the node text.
        """
        nodeType = node.attrib.get('__type')
        if not nodeType:
            # typeless tags with text become string
            if node.text is not None and len(node.text.strip()) > 0:
                nodeType = 'str'
            else:
                nodeType = 'void'
        nodeId = xml_types[nodeType]

        isArray = 0
        count = node.attrib.get('__count')
        if count:
            count = int(count)
            isArray = 64  # bit position for array flag

        self.nodeBuf.append_u8(nodeId | isArray)

        name = node.tag
        pack_sixbit(name, self.nodeBuf)

        if nodeType != 'void':
            fmt = xml_formats[nodeId]

            val = node.text
            if fmt['name'] == 'bin':
                # An element without text has node.text == None; treat it as
                # an empty blob instead of crashing in fromhex().
                data = bytes(bytearray.fromhex(val if val is not None else ''))
            elif fmt['name'] == 'str':
                # Same for strings: an empty element is the empty string.
                if val is None:
                    val = ''
                data = bytes(val.encode(self.encoding) + b'\0')
            else:
                # split() without an argument tolerates repeated, leading and
                # trailing whitespace, which split(' ') turned into '' tokens.
                val = val.split()
                data = list(map(fmt.get('fromStr', int), val))
                # Compare by multiplication: '/' truncates on Python 2 and
                # would let a partially filled last element slip through.
                if count and len(data) != count * fmt['count']:
                    raise ValueError('Array length does not match __count attribute')

            if isArray or fmt['count'] == -1:
                # Variable-length payload: byte size first, then the values.
                self.dataBuf.append_u32(len(data) * calcsize(fmt['type']))
                self.dataBuf.append(data, fmt['type'], len(data))
                self.dataBuf.realign_writes()
            else:
                self.data_append_aligned(data, fmt['type'], fmt['count'])

        # for test consistency and to be more faithful, sort the attrs
        sorted_attrs = sorted(node.attrib.items(), key=operator.itemgetter(0))
        for key, value in sorted_attrs:
            # The double-underscore attributes are metadata for this
            # converter and are not written out as real attributes.
            if key not in ['__type', '__size', '__count']:
                self.data_append_string(value)
                self.nodeBuf.append_u8(xml_types['attr'])
                pack_sixbit(key, self.nodeBuf)

        for child in node.iterchildren(tag=etree.Element):
            self._node_to_binary(child)

        # always has the isArray bit set
        self.nodeBuf.append_u8(xml_types['nodeEnd'] | 64)

    def to_binary(self):
        """Return the document encoded as binary XML bytes.

        Output layout: 4 header bytes, node section length, node section,
        data section length, data section.
        """
        self.encoding = BIN_ENCODING

        header = ByteBuffer()
        header.append_u8(SIGNATURE)
        header.append_u8(SIG_COMPRESSED)
        header.append_u8(encoding_vals[self.encoding])
        # Python's ints are big, so can't just bitwise invert
        header.append_u8(0xFF ^ encoding_vals[self.encoding])
        self.nodeBuf = ByteBuffer()
        self.dataBuf = ByteBuffer()
        # The byte/word cursors are built over the data buffer's own storage.
        self.dataByteBuf = ByteBuffer(self.dataBuf.data)
        self.dataWordBuf = ByteBuffer(self.dataBuf.data)

        self._node_to_binary(self.xml_doc)

        # always has the isArray bit set
        self.nodeBuf.append_u8(xml_types['endSection'] | 64)
        self.nodeBuf.realign_writes()
        header.append_u32(len(self.nodeBuf))
        self.nodeBuf.append_u32(len(self.dataBuf))
        return bytes(header.data + self.nodeBuf.data + self.dataBuf.data)

    def from_binary(self, input):
        """Decode binary XML bytes into ``self.xml_doc``.

        Raises AssertionError on a bad signature, compression byte or
        encoding check byte, and NotImplementedError for a node type that
        has no entry in ``xml_formats``.
        """
        # Temporary container; the real root becomes its first child.
        self.xml_doc = etree.Element('root')
        node = self.xml_doc

        self.nodeBuf = ByteBuffer(input)
        assert self.nodeBuf.get_u8() == SIGNATURE

        compress = self.nodeBuf.get_u8()
        assert compress in (SIG_COMPRESSED, SIG_UNCOMPRESSED)
        self.compressed = compress == SIG_COMPRESSED

        encoding_key = self.nodeBuf.get_u8()
        # The fourth header byte is the bitwise complement of the third.
        assert self.nodeBuf.get_u8() == 0xFF ^ encoding_key
        self.encoding = encoding_strings[encoding_key]

        # Node section length plus the 8 bytes already consumed.
        nodeEnd = self.nodeBuf.get_u32() + 8
        self.nodeBuf.end = nodeEnd

        self.dataBuf = ByteBuffer(input, nodeEnd)
        # Data section length: not needed for decoding, but it must be read
        # so the cursor moves past it.
        self.dataBuf.get_u32()
        # This is all no fun
        self.dataByteBuf = ByteBuffer(input, nodeEnd)
        self.dataWordBuf = ByteBuffer(input, nodeEnd)

        nodesLeft = True
        while nodesLeft and self.nodeBuf.hasData():
            while self.nodeBuf.peek_u8() == 0:
                debug_print("Skipping 0 node ID")
                self.nodeBuf.get_u8()

            nodeType = self.nodeBuf.get_u8()
            isArray = nodeType & 64
            nodeType &= ~64

            nodeFormat = xml_formats.get(nodeType, {'name':'Unknown'})
            debug_print('Node type is {} ({})'.format(nodeFormat['name'], nodeType))

            # node or attribute name
            name = ''
            if nodeType != xml_types['nodeEnd'] and nodeType != xml_types['endSection']:
                if self.compressed:
                    name = unpack_sixbit(self.nodeBuf)
                else:
                    length = self.nodeBuf.get_u8()
                    # NOTE(review): presumably returns raw bytes rather than
                    # text; verify against ByteBuffer.get before relying on
                    # the uncompressed path.
                    name = self.nodeBuf.get('s', length)
                debug_print(name)

            skip = True

            if nodeType == xml_types['attr']:
                value = self.data_grab_string()
                node.attrib[name] = value
            elif nodeType == xml_types['nodeEnd']:
                if node.getparent() is not None:
                    node = node.getparent()
            elif nodeType == xml_types['endSection']:
                nodesLeft = False
            elif nodeType not in xml_formats:
                raise NotImplementedError('Implement node {}'.format(nodeType))
            else:  # inner value to process
                skip = False

            if skip:
                continue

            child = etree.SubElement(node, name)
            node = child

            if nodeType == xml_types['nodeStart']:
                continue

            node.attrib['__type'] = nodeFormat['name']

            varCount = nodeFormat['count']
            arrayCount = 1
            if varCount == -1:  # the 2 cannot be combined
                varCount = self.dataBuf.get_u32()
                isArray = True
            elif isArray:
                # Stored value is a byte size; divide by the element size.
                elementSize = calcsize(nodeFormat['type']) * varCount
                arrayCount = self.dataBuf.get_u32() // elementSize
                node.attrib['__count'] = str(arrayCount)
            totalCount = arrayCount * varCount

            if isArray:
                data = self.dataBuf.get(nodeFormat['type'], totalCount)
                self.dataBuf.realign_reads()
            else:
                data = self.data_grab_aligned(nodeFormat['type'], totalCount)

            if nodeType == xml_types['binary']:
                node.attrib['__size'] = str(totalCount)
                string = ''.join(('{0:02x}'.format(x) for x in data))
            elif nodeType == xml_types['string']:
                string = bytes(data[:-1]).decode(self.encoding)
            else:
                string = ' '.join(map(nodeFormat.get('toStr', str), data))

            node.text = string

        # because we need the 'real' root
        self.xml_doc = self.xml_doc[0]
|
|
|
|
# Command-line entry point: convert the given file to the other format and
# write the result to stdout (binary in -> text XML out, otherwise binary out).
if __name__ == '__main__':
    if len(sys.argv) != 2:
        print('bin_xml.py file.[xml/bin]')
        # sys.exit() rather than the bare exit(): the latter is injected by
        # the `site` module and is missing under `python -S` or when frozen.
        sys.exit()

    with open(sys.argv[1], 'rb') as f:
        raw = f.read()

    xml = KBinXML(raw)
    if KBinXML.is_binary_xml(raw):
        stdout.write(xml.to_text().encode('utf-8'))
    else:
        stdout.write(xml.to_binary())
|