Fix BrokenPipeError when piping kbinxml output

Remove py2 support, format code, add type annotations, convert illegal node names
Merge branch 'master' of https://github.com/mon/kbinxml
2026-04-17 06:46:01 -05:00 · 2024-04-28 10:54:05 +10:00 · 2023-10-26 18:02:31 +10:00 · 2023-07-13 21:09:24 +10:00 · 2023-07-13 21:09:15 +10:00 · 2023-06-03 11:22:20 +10:00
13 changed files with 485 additions and 241 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
 *.pyc
 dist/
 kbinxml.egg-info/
 build/
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -0,0 +1,3 @@
 {
    "editor.formatOnSave": true
 }
--- a/21
+++ b/21
@ -0,0 +1,21 @@
 MIT License
 Copyright (c) 2018 mon
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/README.md
+++ b/README.md
@ -3,8 +3,11 @@
 An encoder/decoder for Konami's binary XML format, used in some of their games.
 ### Setup:
-`pip install .`
+`pip install kbinxml`
 You can use `kbinxml` from the command line to convert files.
 Python usage:  
 ```python
 In [1]: from kbinxml import KBinXML
 In [2]: text = KBinXML('<?xml version="1.0"?><root __type="str">Hello, world!</root>')
@ -14,6 +17,4 @@ Out[4]: b'\xa0B\x80\x7f\x00\x00\x00\x08\x0b\x04\xdfM9\xfe\xff\x00\x00\x00\x00\x1
 In [5]: bin = KBinXML(Out[4])
 In [6]: bin.to_text()
 Out[6]: u'<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<root __type="str">Hello, world!</root>\n'
-```
+```
 You can also use `kbinxml` from the command line to convert files.
--- a/kbinxml/bytebuffer.py
+++ b/kbinxml/bytebuffer.py
@ -1,26 +1,33 @@
 from struct import *
 from typing import Any
-class ByteBuffer():
+
-    def __init__(self, input = b'', offset = 0, endian = '>'):
+class ByteBuffer:
    def __init__(self, input: bytes | bytearray | str = b"", offset=0, endian=">"):
        # so multiple ByteBuffers can hold on to one set of underlying data
        # this is useful for writers in multiple locations
        if isinstance(input, bytearray):
            self.data = input
        else:
            if not isinstance(input, bytes):
-                input = input.encode('utf-8')
+                input = input.encode("utf-8")
            self.data = bytearray(input)
        self.endian = endian
        self.offset = offset
        self.end = len(self.data)
-    def _format_type(self, type, count):
+    def _format_type(self, type: str, count: int | None = None):
        if count is None:
            return self.endian + type
        else:
            return self.endian + str(count) + type
-    def get(self, type, count = None):
+    def get_bytes(self, count: int):
        start = self.offset
        self.offset += count
        return self.data[start : self.offset]
    def get(self, type: str, count: int | None = None):
        ret = self.peek(type, count)
        size = calcsize(type)
        if count is not None:
@ -28,12 +35,16 @@ class ByteBuffer():
        self.offset += size
        return ret
-    def peek(self, type, count = None):
+    def peek(self, type: str, count: int | None = None):
        fmt = self._format_type(type, count)
        ret = unpack_from(fmt, self.data, self.offset)
        return ret[0] if count is None else ret
-    def append(self, data, type, count = None):
+    def append_bytes(self, data: bytes):
        self.data.extend(data)
        self.offset += len(data)
    def append(self, data: Any, type: str, count: int | None = None):
        fmt = self._format_type(type, count)
        self.offset += calcsize(fmt)
        try:
@ -41,7 +52,7 @@ class ByteBuffer():
        except TypeError:
            self.data.extend(pack(fmt, data))
-    def set(self, data, offset, type, count = None):
+    def set(self, data: Any, offset: int, type: str, count: int | None = None):
        fmt = self._format_type(type, count)
        try:
            pack_into(fmt, self.data, offset, *data)
@ -52,54 +63,109 @@ class ByteBuffer():
    def hasData(self):
        return self.offset < self.end
-    def realign_writes(self, size = 4):
+    def realign_writes(self, size=4):
        while len(self) % size:
            self.append_u8(0)
-    def realign_reads(self, size = 4):
+    def realign_reads(self, size=4):
        while self.offset % size:
            self.offset += 1
    def __len__(self):
        return len(self.data)
-typeMap = {
+    def get_s8(self) -> int:
-    's8'  : 'b',
+        return self.get("b")
    's16' : 'h',
    's32' : 'i',
    's64' : 'q',
    'u8'  : 'B',
    'u16' : 'H',
    'u32' : 'I',
    'u64' : 'Q'
 }
-def _make_get(fmt):
+    def peek_s8(self) -> int:
-    def _method(self):
+        return self.peek("b")
        return self.get(fmt)
    return _method
-def _make_peek(fmt):
+    def append_s8(self, data: int):
-    def _method(self):
+        return self.append(data, "b")
        return self.peek(fmt)
    return _method
-def _make_append(fmt):
+    def set_s8(self, data: int, offset: int):
-    def _method(self, data):
+        return self.set(data, offset, "b")
        return self.append(data, fmt)
    return _method
-def _make_set(fmt):
+    def get_s16(self) -> int:
-    def _method(self, data, offset):
+        return self.get("h")
        return self.set(data, offset, fmt)
    return _method
-for name, fmt in typeMap.items():
+    def peek_s16(self) -> int:
-    _get = _make_get(fmt)
+        return self.peek("h")
-    _peek = _make_peek(fmt)
+
-    _append = _make_append(fmt)
+    def append_s16(self, data: int):
-    _set = _make_set(fmt)
+        return self.append(data, "h")
-    setattr(ByteBuffer, 'get_' + name, _get)
+
-    setattr(ByteBuffer, 'peek_' + name, _peek)
+    def set_s16(self, data: int, offset: int):
-    setattr(ByteBuffer, 'append_' + name, _append)
+        return self.set(data, offset, "h")
-    setattr(ByteBuffer, 'set_' + name, _set)
+
    def get_s32(self) -> int:
        return self.get("i")
    def peek_s32(self) -> int:
        return self.peek("i")
    def append_s32(self, data: int):
        return self.append(data, "i")
    def set_s32(self, data: int, offset: int):
        return self.set(data, offset, "i")
    def get_s64(self) -> int:
        return self.get("q")
    def peek_s64(self) -> int:
        return self.peek("q")
    def append_s64(self, data: int):
        return self.append(data, "q")
    def set_s64(self, data: int, offset: int):
        return self.set(data, offset, "q")
    def get_u8(self) -> int:
        return self.get("B")
    def peek_u8(self) -> int:
        return self.peek("B")
    def append_u8(self, data: int):
        return self.append(data, "B")
    def set_u8(self, data: int, offset: int):
        return self.set(data, offset, "B")
    def get_u16(self) -> int:
        return self.get("H")
    def peek_u16(self) -> int:
        return self.peek("H")
    def append_u16(self, data: int):
        return self.append(data, "H")
    def set_u16(self, data: int, offset: int):
        return self.set(data, offset, "H")
    def get_u32(self) -> int:
        return self.get("I")
    def peek_u32(self) -> int:
        return self.peek("I")
    def append_u32(self, data: int):
        return self.append(data, "I")
    def set_u32(self, data: int, offset: int):
        return self.set(data, offset, "I")
    def get_u64(self) -> int:
        return self.get("Q")
    def peek_u64(self) -> int:
        return self.peek("Q")
    def append_u64(self, data: int):
        return self.append(data, "Q")
    def set_u64(self, data: int, offset: int):
        return self.set(data, offset, "Q")
--- a/kbinxml/format_ids.py
+++ b/kbinxml/format_ids.py
@ -1,22 +1,27 @@
 from struct import pack, unpack
 def parseIP(string):
    bunch = map(int, string.split('.'))
    # pack to bytes
    p = pack('4B', *bunch)
    # unpack as u16
    return unpack('>I', p)[0]
-def writeIP(raw):
+def parseIP(string: str) -> int:
    bunch = map(int, string.split("."))
    # pack to bytes
-    p = pack('>I', raw)
+    p = pack("4B", *bunch)
    # unpack as u32
    return unpack(">I", p)[0]
 def writeIP(raw: int):
    # pack to bytes
    p = pack(">I", raw)
    # unpack
-    return '.'.join(map(str, unpack('4B', p)))
+    return ".".join(map(str, unpack("4B", p)))
-def writeFloat(raw):
+
 def writeFloat(raw: float):
    # this is just how floats get printed...
-    return '{0:.6f}'.format(raw)
+    return f"{raw:.6f}"
 # fmt: off
 xml_formats = {
    1  : { 'names' : ['void']},
    2  : { 'type' : 'b',  'count' : 1,  'names' : ['s8']},
@ -75,15 +80,16 @@ xml_formats = {
    55 : { 'type' : 'b',  'count' : 4,  'names' : ['4b']},
    56 : { 'type' : 'b',  'count' : 16, 'names' : ['vb']}
 }
 # fmt: on
 # little less boilerplate for writing
 for key, val in xml_formats.items():
-    xml_formats[key]['name'] = xml_formats[key]['names'][0]
+    xml_formats[key]["name"] = xml_formats[key]["names"][0]
-xml_types = {}
+xml_types: dict[str, int] = {}
 for key, val in xml_formats.items():
-    for n in val['names']:
+    for n in val["names"]:
        xml_types[n] = key
-xml_types['nodeStart'] = 1
+xml_types["nodeStart"] = 1
-xml_types['nodeEnd'] = 190
+xml_types["nodeEnd"] = 190
-xml_types['endSection'] = 191
+xml_types["endSection"] = 191
--- a/kbinxml/kbinxml.py
+++ b/kbinxml/kbinxml.py
@ -1,18 +1,14 @@
-# python 3 style, ints instead of b''
+import argparse
 from builtins import bytes
 from struct import calcsize
 import string
 import sys
 import operator
 import sys
 from io import BytesIO
 from struct import calcsize
 import lxml.etree as etree
 from .bytebuffer import ByteBuffer
 from .sixbit import pack_sixbit, unpack_sixbit
 from .format_ids import xml_formats, xml_types
-
+from .sixbit import pack_sixbit, unpack_sixbit
 stdout = getattr(sys.stdout, 'buffer', sys.stdout)
 DEBUG_OFFSETS = False
 DEBUG = False
@ -22,27 +18,42 @@ SIGNATURE = 0xA0
 SIG_COMPRESSED = 0x42
 SIG_UNCOMPRESSED = 0x45
-XML_ENCODING = 'UTF-8'
+XML_ENCODING = "UTF-8"
-BIN_ENCODING = 'SHIFT_JISX0213'
+BIN_ENCODING = "cp932"  # windows shift-jis variant
 # NOTE: all of these are their python codec names
 encoding_strings = {
-    0x20: 'ASCII',
+    0x00: "cp932",
-    0x00: 'ISO-8859-1',
+    0x20: "ASCII",
-    0x60: 'EUC_JP',
+    0x40: "ISO-8859-1",
-    0x80: 'SHIFT_JISX0213',
+    0x60: "EUC_JP",
-    0xA0: 'UTF-8'
+    0x80: "cp932",
    0xA0: "UTF-8",
 }
-encoding_vals = {val : key for key, val in encoding_strings.items()}
+encoding_vals = {val: key for key, val in encoding_strings.items()}
 # ensure that duplicated value from above is correct. Avoid exporting 0x00 type
 encoding_vals["cp932"] = 0x80
 def debug_print(string):
    if DEBUG:
        print(string)
 class KBinXML():
-    def __init__(self, input):
+class KBinException(Exception):
    pass
 class KBinXML:
    def __init__(self, input, convert_illegal_things=False):
        """If `convert_illegal_things` is true,
        - Any shift-jis string that cannot be decoded as shift-jis will
          try to be decoded as utf-8
        - If a node name is invalid (for example, it starts with a number),
          the name will be prefixed with an underscore
        """
        self.convert_illegal_things = convert_illegal_things
        if isinstance(input, etree._Element):
            self.xml_doc = input
        elif isinstance(input, etree._ElementTree):
@ -52,38 +63,119 @@ class KBinXML():
        else:
            self.from_text(input)
-    def to_text(self):
+    def to_text(self) -> str:
        # we decode again because I want unicode, dammit
-        return etree.tostring(self.xml_doc, pretty_print=True,
+        return etree.tostring(
-            encoding=XML_ENCODING, xml_declaration=True).decode(XML_ENCODING)
+            self.xml_doc, pretty_print=True, encoding=XML_ENCODING, xml_declaration=True
        ).decode(XML_ENCODING)
    def from_text(self, input):
        self.xml_doc = etree.parse(BytesIO(input)).getroot()
        self.encoding = XML_ENCODING
        self.compressed = True
        self.dataSize = None
    @staticmethod
    def is_binary_xml(input):
        if len(input) < 2:
            return False
        nodeBuf = ByteBuffer(input)
-        return (nodeBuf.get_u8() == SIGNATURE and
+        return nodeBuf.get_u8() == SIGNATURE and nodeBuf.get_u8() in (
-            nodeBuf.get_u8() in (SIG_COMPRESSED, SIG_UNCOMPRESSED))
+            SIG_COMPRESSED,
            SIG_UNCOMPRESSED,
        )
    @property
    def _data_mem_size(self):
        # This is probably better to be done in the parsing/writeout stage...
        data_len = 0
        for e in self.xml_doc.iter(tag=etree.Element):
            t = e.attrib.get("__type")
            if t is None:
                continue
            count = e.attrib.get("__count", 1)
            size = e.attrib.get("__size", 1)
            x = xml_formats[xml_types[t]]
            if x["count"] > 0:
                m = x["count"] * calcsize(x["type"]) * count * size
            elif x["name"] == "bin":
                m = len(e.text) // 2
            else:  # string
                # null terminator space
                m = len(e.text.encode(self.encoding)) + 1
            if m <= 4:
                continue
            if x["name"] == "bin":
                data_len += (m + 1) & ~1
            else:
                data_len += (m + 3) & ~3
        return data_len
    @property
    def mem_size(self):
        """used when allocating memory ingame"""
        data_len = self._data_mem_size
        node_count = len(list(self.xml_doc.iter(tag=etree.Element)))
        if self.compressed:
            size = 52 * node_count + data_len + 630
        else:
            tags_len = 0
            for e in self.xml_doc.iter(tag=etree.Element):
                e_len = max(len(e.tag), 8)
                e_len = (e_len + 3) & ~3
                tags_len += e_len
            size = 56 * node_count + data_len + 630 + tags_len
        # debugging
        # print('nodes:{} ({}) data:{} ({})'.format(node_count,hex(node_count), data_len, hex(data_len)))
        return (size + 8) & ~7
    def data_grab_auto(self):
        size = self.dataBuf.get_s32()
-        ret = self.dataBuf.get('B', size)
+        ret = self.dataBuf.get_bytes(size)
        self.dataBuf.realign_reads()
        return ret
    def data_append_auto(self, data):
        self.dataBuf.append_s32(len(data))
-        self.dataBuf.append(data, 'B', len(data))
+        self.dataBuf.append_bytes(data)
        self.dataBuf.realign_writes()
    def data_grab_string(self):
        data = self.data_grab_auto()
-        return bytes(data[:-1]).decode(self.encoding)
+        data = bytes(data[:-1])
        try:
            return data.decode(self.encoding)
        except UnicodeDecodeError as e:
            if self.encoding == "cp932":
                if not self.convert_illegal_things:
                    raise KBinException(
                        f"Could not decode string. To force utf8 decode {convert_illegal_help}."
                    ) from e
                # having to do this kinda sucks, but it's better than just giving up
                print(
                    "KBinXML: Malformed Shift-JIS string found, attempting UTF-8 decode",
                    file=sys.stderr,
                )
                print("KBinXML: Raw string data:", data, file=sys.stderr)
                return data.decode("utf8")
            else:
                # in the unlikely event of malformed data that isn't shift-jis,
                # fix it later
                raise
    def data_append_string(self, string):
-        string = bytes(string.encode(self.encoding) + b'\0')
+        string = bytes(string.encode(self.encoding) + b"\0")
        self.data_append_auto(string)
    # has its own separate state and other assorted garbage
@ -127,70 +219,99 @@ class KBinXML():
            self.dataBuf.append(data, type, count)
            self.dataBuf.realign_writes()
    def append_node_name(self, name):
        if self.compressed:
            pack_sixbit(name, self.nodeBuf)
        else:
            enc = name.encode(self.encoding)
            self.nodeBuf.append_u8((len(enc) - 1) | 64)
            self.nodeBuf.append_bytes(enc)
    def _add_namespace(self, node, name, value):
        """Add a namespace (xmlns) to an existing node. Returns the new node to
        work with"""
        # I wish this worked, but we need to specify it in the constructor
        # node.nsmap[name] = value
        ns = node.nsmap
        ns[name] = value
        old_node = node
        node = etree.Element(old_node.tag, nsmap=ns)
        node[:] = old_node[:]
        parent = old_node.getparent()
        if parent is not None:
            parent.remove(old_node)
            parent.append(node)
        return node
    def _node_to_binary(self, node):
-        nodeType = node.attrib.get('__type')
+        nodeType = node.attrib.get("__type")
        if not nodeType:
            # typeless tags with text become string
            if node.text is not None and len(node.text.strip()) > 0:
-                nodeType = 'str'
+                nodeType = "str"
            else:
-                nodeType = 'void'
+                nodeType = "void"
        nodeId = xml_types[nodeType]
        isArray = 0
-        count = node.attrib.get('__count')
+        count = node.attrib.get("__count")
        if count:
            count = int(count)
-            isArray = 64 # bit position for array flag
+            isArray = 64  # bit position for array flag
        self.nodeBuf.append_u8(nodeId | isArray)
        name = node.tag
-        pack_sixbit(name, self.nodeBuf)
+        self.append_node_name(name)
-        if nodeType != 'void':
+        if nodeType != "void":
            fmt = xml_formats[nodeId]
            val = node.text
-            if fmt['name'] == 'bin':
+            if fmt["name"] == "bin":
                data = bytes(bytearray.fromhex(val))
-            elif fmt['name'] == 'str':
+            elif fmt["name"] == "str":
-                if val is None: # empty string
+                if val is None:  # empty string
-                    val = ''
+                    val = ""
-                data = bytes(val.encode(self.encoding, 'replace') + b'\0')
+                data = bytes(val.encode(self.encoding, "replace") + b"\0")
            else:
-                val = val.split(' ')
+                val = val.split(" ")
-                data = list(map(fmt.get('fromStr', int), val))
+                data = list(map(fmt.get("fromStr", int), val))
-                if count and len(data) / fmt['count'] != count:
+                if count and len(data) / fmt["count"] != count:
-                    raise ValueError('Array length does not match __count attribute')
+                    raise ValueError("Array length does not match __count attribute")
-            if isArray or fmt['count'] == -1:
+            if isArray or fmt["count"] == -1:
-                self.dataBuf.append_u32(len(data) * calcsize(fmt['type']))
+                self.dataBuf.append_u32(len(data) * calcsize(fmt["type"]))
-                self.dataBuf.append(data, fmt['type'], len(data))
+                self.dataBuf.append(data, fmt["type"], len(data))
                self.dataBuf.realign_writes()
            else:
-                self.data_append_aligned(data, fmt['type'], fmt['count'])
+                self.data_append_aligned(data, fmt["type"], fmt["count"])
        # for test consistency and to be more faithful, sort the attrs
        sorted_attrs = sorted(node.attrib.items(), key=operator.itemgetter(0))
        for key, value in sorted_attrs:
-            if key not in ['__type', '__size', '__count']:
+            if key not in ["__type", "__size", "__count"]:
                self.data_append_string(value)
-                self.nodeBuf.append_u8(xml_types['attr'])
+                self.nodeBuf.append_u8(xml_types["attr"])
-                pack_sixbit(key, self.nodeBuf)
+                self.append_node_name(key)
        for child in node.iterchildren(tag=etree.Element):
            self._node_to_binary(child)
        # always has the isArray bit set
-        self.nodeBuf.append_u8(xml_types['nodeEnd'] | 64)
+        self.nodeBuf.append_u8(xml_types["nodeEnd"] | 64)
-    def to_binary(self):
+    def to_binary(self, encoding=BIN_ENCODING, compressed=True):
-        self.encoding = BIN_ENCODING
+        self.encoding = encoding
        self.compressed = compressed
        header = ByteBuffer()
        header.append_u8(SIGNATURE)
-        header.append_u8(SIG_COMPRESSED)
+        if self.compressed:
            header.append_u8(SIG_COMPRESSED)
        else:
            header.append_u8(SIG_UNCOMPRESSED)
        header.append_u8(encoding_vals[self.encoding])
        # Python's ints are big, so can't just bitwise invert
        header.append_u8(0xFF ^ encoding_vals[self.encoding])
@ -202,14 +323,15 @@ class KBinXML():
        self._node_to_binary(self.xml_doc)
        # always has the isArray bit set
-        self.nodeBuf.append_u8(xml_types['endSection'] | 64)
+        self.nodeBuf.append_u8(xml_types["endSection"] | 64)
        self.nodeBuf.realign_writes()
        header.append_u32(len(self.nodeBuf))
-        self.nodeBuf.append_u32(len(self.dataBuf))
+        self.dataSize = len(self.dataBuf)
        self.nodeBuf.append_u32(self.dataSize)
        return bytes(header.data + self.nodeBuf.data + self.dataBuf.data)
    def from_binary(self, input):
-        self.xml_doc = etree.Element('root')
+        self.xml_doc = etree.Element("root")
        node = self.xml_doc
        self.nodeBuf = ByteBuffer(input)
@ -227,7 +349,7 @@ class KBinXML():
        self.nodeBuf.end = nodeEnd
        self.dataBuf = ByteBuffer(input, nodeEnd)
-        dataSize = self.dataBuf.get_u32()
+        self.dataSize = self.dataBuf.get_u32()
        # This is all no fun
        self.dataByteBuf = ByteBuffer(input, nodeEnd)
        self.dataWordBuf = ByteBuffer(input, nodeEnd)
@ -242,88 +364,130 @@ class KBinXML():
            isArray = nodeType & 64
            nodeType &= ~64
-            nodeFormat = xml_formats.get(nodeType, {'name':'Unknown'})
+            nodeFormat = xml_formats.get(nodeType, {"name": "Unknown"})
-            debug_print('Node type is {} ({})'.format(nodeFormat['name'], nodeType))
+            debug_print("Node type is {} ({})".format(nodeFormat["name"], nodeType))
            # node or attribute name
-            name = ''
+            name = ""
-            if nodeType != xml_types['nodeEnd'] and nodeType != xml_types['endSection']:
+            if nodeType != xml_types["nodeEnd"] and nodeType != xml_types["endSection"]:
                if self.compressed:
                    name = unpack_sixbit(self.nodeBuf)
                else:
                    length = (self.nodeBuf.get_u8() & ~64) + 1
-                    name = self.nodeBuf.get('B', length)
+                    name = self.nodeBuf.get_bytes(length)
                    name = bytes(name).decode(self.encoding)
                debug_print(name)
            skip = True
-            if nodeType == xml_types['attr']:
+            if nodeType == xml_types["attr"]:
                value = self.data_grab_string()
-                node.attrib[name] = value
+                # because someone thought it was a good idea to serialise namespaces
-            elif nodeType == xml_types['nodeEnd']:
+                if name.startswith("xmlns:"):
                    _, name = name.split("xmlns:")
                    node = self._add_namespace(node, name, value)
                elif ":" in name:
                    prefix, name = name.split(":")
                    # if this fails, the xml is invalid. Open an issue.
                    node.set(etree.QName(node.nsmap[prefix], name), value)
                # this is the case you'll get in 99% of places
                else:
                    node.attrib[name] = value
            elif nodeType == xml_types["nodeEnd"]:
                if node.getparent() is not None:
                    node = node.getparent()
-            elif nodeType == xml_types['endSection']:
+            elif nodeType == xml_types["endSection"]:
                nodesLeft = False
            elif nodeType not in xml_formats:
-                raise NotImplementedError('Implement node {}'.format(nodeType))
+                raise NotImplementedError("Implement node {}".format(nodeType))
-            else: # inner value to process
+            else:  # inner value to process
                skip = False
            if skip:
                continue
-            child = etree.SubElement(node, name)
+            try:
                child = etree.SubElement(node, name)
            except ValueError as e:
                fixed_name = f"_{name}"
                if self.convert_illegal_things:
                    # todo: there are other invalid node names. Fix them when you see them.
                    child = etree.SubElement(node, fixed_name)
                else:
                    raise KBinException(
                        f'Could not create node with name "{name}". To rename it to "{fixed_name}", {convert_illegal_help}.'
                    ) from e
            node = child
-            if nodeType == xml_types['nodeStart']:
+            if nodeType == xml_types["nodeStart"]:
                continue
-            node.attrib['__type'] = nodeFormat['name']
+            node.attrib["__type"] = nodeFormat["name"]
-            varCount = nodeFormat['count']
+            varCount = nodeFormat["count"]
            arrayCount = 1
-            if varCount == -1: # the 2 cannot be combined
+            if varCount == -1:  # the 2 cannot be combined
                varCount = self.dataBuf.get_u32()
                isArray = True
            elif isArray:
-                arrayCount = self.dataBuf.get_u32() // (calcsize(nodeFormat['type'] * varCount))
+                arrayCount = self.dataBuf.get_u32() // (
-                node.attrib['__count'] = str(arrayCount)
+                    calcsize(nodeFormat["type"] * varCount)
                )
                node.attrib["__count"] = str(arrayCount)
            totalCount = arrayCount * varCount
            if isArray:
-                data = self.dataBuf.get(nodeFormat['type'], totalCount)
+                data = self.dataBuf.get(nodeFormat["type"], totalCount)
                self.dataBuf.realign_reads()
            else:
-                data = self.data_grab_aligned(nodeFormat['type'], totalCount)
+                data = self.data_grab_aligned(nodeFormat["type"], totalCount)
-            if nodeType == xml_types['binary']:
+            if nodeType == xml_types["binary"]:
-                node.attrib['__size'] = str(totalCount)
+                node.attrib["__size"] = str(totalCount)
-                string = ''.join(('{0:02x}'.format(x) for x in data))
+                string = "".join(("{0:02x}".format(x) for x in data))
-            elif nodeType == xml_types['string']:
+            elif nodeType == xml_types["string"]:
                string = bytes(data[:-1]).decode(self.encoding)
            else:
-                string = ' '.join(map(nodeFormat.get('toStr', str), data))
+                string = " ".join(map(nodeFormat.get("toStr", str), data))
-            node.text = string
+            # some strings have extra NUL bytes, compatible behaviour is to strip
            node.text = string.strip("\0")
        # because we need the 'real' root
        self.xml_doc = self.xml_doc[0]
 def main():
    if len(sys.argv) != 2:
        print('bin_xml.py file.[xml/bin]')
        exit()
-    with open(sys.argv[1], 'rb') as f:
+convert_illegal_help = "set convert_illegal_things=True in the KBinXML constructor"
 def main():
    # interestingly, this doesn't work if added inside the
    # `if __name__ == "__main__"` branch
    global convert_illegal_help
    convert_illegal_help = "add the --convert-illegal flag"
    parser = argparse.ArgumentParser(
        prog="kbinxml", description="Convert kbin to xml, or xml to kbin"
    )
    parser.add_argument("filename", metavar="file.[xml/bin]")
    parser.add_argument("--convert-illegal", action="store_true")
    args = parser.parse_args()
    with open(args.filename, "rb") as f:
        input = f.read()
-    xml = KBinXML(input)
+    xml = KBinXML(input, convert_illegal_things=args.convert_illegal)
-    if KBinXML.is_binary_xml(input):
+    stdout = getattr(sys.stdout, "buffer", sys.stdout)
-        stdout.write(xml.to_text().encode('utf-8'))
+    try:
-    else:
+        if KBinXML.is_binary_xml(input):
-        stdout.write(xml.to_binary())
+            stdout.write(xml.to_text().encode("utf-8"))
        else:
            stdout.write(xml.to_binary())
    except BrokenPipeError:
        # allows kbinxml to be piped to `head` or similar
        sys.exit(141)
-if __name__ == '__main__':
+
 if __name__ == "__main__":
    main()
--- a/kbinxml/sixbit.py
+++ b/kbinxml/sixbit.py
@ -1,56 +1,36 @@
-# python 3 style, ints instead of b''
+from kbinxml.bytebuffer import ByteBuffer
 from builtins import bytes
 from bitarray import bitarray
-def pack_sixbit(string, byteBuf):
+
-    chars = str_to_sixbit(string)
+charmap = "0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"
-    bits = bitarray(endian='big')
+bytemap = {c: i for i, c in enumerate(charmap)}
 def pack_sixbit(string: str, byteBuf: ByteBuffer):
    chars = [bytemap[x] for x in string]
    padding = 8 - (len(string) * 6 % 8)
    if padding == 8:
        padding = 0
    bits = 0
    for c in chars:
-        bits.frombytes(c.encode())
+        bits <<= 6
-        # leave only the 6 bits we care for
+        bits |= c
-        del bits[-8:-6]
+    bits <<= padding
-    data = bytes(bits.tobytes())
+    data = bits.to_bytes((len(string) * 6 + padding) // 8, byteorder="big")
-    byteBuf.append_u8(len(string))
+    byteBuf.append_bytes((len(string),))
-    byteBuf.append(data, 'B', len(data))
+    byteBuf.append_bytes(data)
-def unpack_sixbit(byteBuf):
+
 def unpack_sixbit(byteBuf: ByteBuffer):
    length = byteBuf.get_u8()
-    length_bytes = (length * 6 + 7) // 8
+    length_bits = length * 6
-    bitBuf = bitarray(endian='big')
+    length_bytes = (length_bits + 7) // 8
-    bitBuf.frombytes(bytes(byteBuf.get('B', length_bytes)))
+    padding = 8 - (length_bits % 8)
    if padding == 8:
        padding = 0
    bits = int.from_bytes(byteBuf.get_bytes(length_bytes), byteorder="big")
    bits >>= padding
    result = []
-    offset = 0
+    for _ in range(length):
-    for i in range(length):
+        result.append(bits & 0b111111)
-        result.append(ord(bitBuf[offset:offset+6].tobytes()) >> (8 - 6))
+        bits >>= 6
-        offset += 6
+    return "".join([charmap[x] for x in result[::-1]])
    return sixbit_to_str(result)
 # 0-9 for numbers, 10 is ':', 11 to 36 for capitals, 37 for underscore, 38-63 for lowercase
 def sixbit_to_str(decompressed):
    string = ''
    for d in decompressed:
        if d <= 10:
            d += ord('0')
        elif d < 37:
            d += 54
        elif d == 37:
            d += 58
        else:
            d += 59
        string += chr(d)
    return string
 def str_to_sixbit(string):
    compress = []
    for c in string:
        if c >= '0' and c <= ':':
            compress.append(ord(c) - ord('0'))
        elif c >= 'A' and c <= 'Z':
            compress.append(ord(c) - 54)
        elif c == '_':
            compress.append(ord(c) - 58)
        elif c >= 'a' and c <= 'z':
            compress.append(ord(c) - 59)
        else:
            raise ValueError('Node or attribute name can only contain alphanumeric + underscore')
    return ''.join(map(chr, compress))
--- a/kbinxml/test.py
+++ b/kbinxml/test.py
@ -1,28 +1,26 @@
 from .kbinxml import KBinXML
 # python 2/3 cross compat
 from io import open
-with open('testcases.xml', 'rb') as f:
+with open("testcases.xml", "rb") as f:
    xml_in = f.read()
-with open('testcases_out.xml', 'r', encoding='UTF-8') as f:
+with open("testcases_out.xml", "r", encoding="UTF-8") as f:
    expected_xml = f.read()
-with open('testcases_out.kbin', 'rb') as f:
+with open("testcases_out.kbin", "rb") as f:
    expected_bin = f.read()
 k = KBinXML(xml_in)
 kbin = k.to_binary()
 if kbin != expected_bin:
-    with open('failed_test.kbin', 'wb') as f:
+    with open("failed_test.kbin", "wb") as f:
        f.write(kbin)
-    raise AssertionError('Binary output does not match, check failed_test.kbin')
+    raise AssertionError("Binary output does not match, check failed_test.kbin")
 else:
-    print('XML -> Binary correct!')
+    print("XML -> Binary correct!")
 backwards = KBinXML(kbin)
 btext = backwards.to_text()
 if btext != expected_xml:
-    with open('failed_test.xml', 'w', encoding='UTF-8') as f:
+    with open("failed_test.xml", "w", encoding="UTF-8") as f:
        f.write(btext)
-    raise AssertionError('XML putput does not match, check failed_test.xml')
+    raise AssertionError("XML putput does not match, check failed_test.xml")
 else:
-    print('Binary -> XML correct!')
+    print("Binary -> XML correct!")
--- a/setup.py
+++ b/setup.py
@ -1,23 +1,24 @@
 from setuptools import setup
 import sys
 requires = [
-        'bitarray',
+    "lxml",
        'lxml',
 ]
 if sys.version_info < (3,0):
    requires.append('future')
 python_requires = ">=3.10"
 version = "2.1"
 setup(
-    name='kbinxml',
+    name="kbinxml",
-    version='1.1',
+    description="Decoder/encoder for Konami's binary XML format",
-    entry_points = {
+    long_description="See Github for up to date documentation",
-        'console_scripts': ['kbinxml=kbinxml:main'],
+    version=version,
    entry_points={
        "console_scripts": ["kbinxml=kbinxml:main"],
    },
-    packages=['kbinxml'],
+    packages=["kbinxml"],
-    url='https://github.com/mon/kbinxml/',
+    url="https://github.com/mon/kbinxml/",
-    author='mon',
+    download_url="https://github.com/mon/kbinxml/archive/{}.tar.gz".format(version),
-    author_email='me@mon.im',
+    author="mon",
-    install_requires=requires
+    author_email="me@mon.im",
    install_requires=requires,
 )
--- a/testcases.xml
+++ b/testcases.xml
@ -8,9 +8,11 @@
    <!-- Testing encoding, plus __type-less should become string -->
    <superstar babe="ミツル">シ　イス　マイ　ワイフ</superstar>
    <!-- Testing 6bit conversion -->
-    <xXx_T4GG3R_xXx __type="3u8">8 9 10</xXx_T4GG3R_xXx>
+    <xXx_T4GG3R_xXx __type="2u8">8 9</xXx_T4GG3R_xXx>
-    <!-- Shouldn't have alignment issues from the 3u8 -->
+    <!-- Shouldn't have alignment issues from the 2u8 -->
    <aligned __type="u8">12</aligned>
    <!-- Array contents are never packed -->
    <aligned_arr __type="u8" __count="1">13</aligned_arr>
    <!-- Binary parsing -->
    <entry __type="binary">DEADBEEF</entry>
    <!-- Lowercase, too -->
--- a/testcases_out.kbin
+++ b/testcases_out.kbin
--- a/testcases_out.xml
+++ b/testcases_out.xml
@ -3,8 +3,9 @@
  <entry __type="ip4" __count="2">127.0.0.1 192.168.0.1</entry>
  <entry __type="str" attr="test" attr2="best">Hello, world!</entry>
  <superstar __type="str" babe="ミツル">シ　イス　マイ　ワイフ</superstar>
-  <xXx_T4GG3R_xXx __type="3u8">8 9 10</xXx_T4GG3R_xXx>
+  <xXx_T4GG3R_xXx __type="2u8">8 9</xXx_T4GG3R_xXx>
  <aligned __type="u8">12</aligned>
  <aligned_arr __type="u8" __count="1">13</aligned_arr>
  <entry __type="bin" __size="4">deadbeef</entry>
  <entry __type="bin" __size="4">deadbe7a</entry>
  <entry __type="3u8" __count="2">1 2 3 1 2 3</entry>
Author	SHA1	Message	Date
William Toohey	1b6a3580cc	Fix BrokenPipeError when piping kbinxml output	2024-04-28 10:54:05 +10:00
William Toohey	b1cb927fd2	Remove py2 support, format code, add type annotations, convert illegal node names	2023-10-26 18:02:31 +10:00
William Toohey	3a7f038c70	Merge branch 'master' of https://github.com/mon/kbinxml	2023-07-13 21:09:24 +10:00
William Toohey	ae7c172820	Be more lenient when decoding malformed strings	2023-07-13 21:09:15 +10:00
Shiz	718d4c8478	testcases: update output kbin for array alignmen test git add -p is deceptive, whoops.	2023-06-03 11:22:20 +10:00
Will	ce3c47325f	Merge pull request #7 from Shizmob/feature/array-alignment-testcase testcases: add array packing/alignment test case	2023-06-03 10:11:54 +10:00
Shiz	48db748632	testcases: add array packing/alignment test case array reads are never taken from the packed buffers, even if the total array size would easily fit in them.	2023-06-02 14:40:21 +02:00
William Toohey	b847de5453	Bump version to 1.7	2021-05-03 22:33:39 +10:00
William Toohey	5707a6bf8b	is_binary_xml: don't explode on too-short inputs	2021-05-03 22:32:27 +10:00
William Toohey	3cbc3179a8	Support namespace deserialisation	2020-07-10 10:23:36 +10:00
William Toohey	b1339d6c4b	Fix strings with extra NUL padding	2020-07-10 09:26:55 +10:00
William Toohey	ca4a6e309e	re-add 0x00 encoding	2018-06-19 16:26:43 +10:00
William Toohey	8784a76672	ISO 8859 was also wrong	2018-06-15 15:58:32 +10:00
William Toohey	68166ea9ab	I was using the wrong encoding the whole time	2018-06-15 15:08:42 +10:00
William Toohey	fed34e0943	Fix python 2 (again)	2018-05-31 15:23:29 +10:00
William Toohey	042323917c	Drop bitarray requirement	2018-05-25 23:59:55 +10:00
William Toohey	dccacdb6e6	Fix python2 (again)	2018-01-30 11:16:21 +10:00
William Toohey	50fa4b8d23	Support writing compressed tag names	2018-01-10 22:24:59 +10:00
William Toohey	a4eb17c70b	Speed improvements	2018-01-09 21:25:56 +10:00
William Toohey	acd3a4195d	PyPi!	2018-01-09 17:05:30 +10:00
William Toohey	1fc6b47484	In-memory size calculation	2018-01-09 16:55:13 +10:00
Will	a77eee91db	Create LICENSE	2018-01-07 22:41:52 +10:00
William Toohey	4e53144e42	Simpler install guide	2018-01-05 01:56:41 +10:00