Fix BrokenPipeError when piping kbinxml output

Remove py2 support, format code, add type annotations, convert illegal node names
Merge branch 'master' of https://github.com/mon/kbinxml
2026-03-22 02:15:37 -05:00 · 2024-04-28 10:54:05 +10:00 · 2023-10-26 18:02:31 +10:00 · 2023-07-13 21:09:24 +10:00 · 2023-07-13 21:09:15 +10:00 · 2023-06-03 11:22:20 +10:00
13 changed files with 485 additions and 241 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
 *.pyc
 dist/
 kbinxml.egg-info/
+build/
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@ -0,0 +1,3 @@
+{
+    "editor.formatOnSave": true
+}
--- a/21
+++ b/21
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 mon
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/README.md
+++ b/README.md
@ -3,8 +3,11 @@
 An encoder/decoder for Konami's binary XML format, used in some of their games.

 ### Setup:
-`pip install .`
+`pip install kbinxml`

+You can use `kbinxml` from the command line to convert files.
+
+Python usage:  
 ```python
 In [1]: from kbinxml import KBinXML
 In [2]: text = KBinXML('<?xml version="1.0"?><root __type="str">Hello, world!</root>')
@ -14,6 +17,4 @@ Out[4]: b'\xa0B\x80\x7f\x00\x00\x00\x08\x0b\x04\xdfM9\xfe\xff\x00\x00\x00\x00\x1
 In [5]: bin = KBinXML(Out[4])
 In [6]: bin.to_text()
 Out[7]: u'<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<root __type="str">Hello, world!</root>\n'
-```
-
-You can also use `kbinxml` from the commandline to convert files.
+```
--- a/kbinxml/bytebuffer.py
+++ b/kbinxml/bytebuffer.py
@ -1,26 +1,33 @@
 from struct import *
+from typing import Any

-class ByteBuffer():
-    def __init__(self, input = b'', offset = 0, endian = '>'):
+
+class ByteBuffer:
+    def __init__(self, input: bytes | bytearray | str = b"", offset=0, endian=">"):
        # so multiple ByteBuffers can hold on to one set of underlying data
        # this is useful for writers in multiple locations
        if isinstance(input, bytearray):
            self.data = input
        else:
            if not isinstance(input, bytes):
-                input = input.encode('utf-8')
+                input = input.encode("utf-8")
            self.data = bytearray(input)
        self.endian = endian
        self.offset = offset
        self.end = len(self.data)

-    def _format_type(self, type, count):
+    def _format_type(self, type: str, count: int | None = None):
        if count is None:
            return self.endian + type
        else:
            return self.endian + str(count) + type

-    def get(self, type, count = None):
+    def get_bytes(self, count: int):
+        start = self.offset
+        self.offset += count
+        return self.data[start : self.offset]
+
+    def get(self, type: str, count: int | None = None):
        ret = self.peek(type, count)
        size = calcsize(type)
        if count is not None:
@ -28,12 +35,16 @@ class ByteBuffer():
        self.offset += size
        return ret

-    def peek(self, type, count = None):
+    def peek(self, type: str, count: int | None = None):
        fmt = self._format_type(type, count)
        ret = unpack_from(fmt, self.data, self.offset)
        return ret[0] if count is None else ret

-    def append(self, data, type, count = None):
+    def append_bytes(self, data: bytes):
+        self.data.extend(data)
+        self.offset += len(data)
+
+    def append(self, data: Any, type: str, count: int | None = None):
        fmt = self._format_type(type, count)
        self.offset += calcsize(fmt)
        try:
@ -41,7 +52,7 @@ class ByteBuffer():
        except TypeError:
            self.data.extend(pack(fmt, data))

-    def set(self, data, offset, type, count = None):
+    def set(self, data: Any, offset: int, type: str, count: int | None = None):
        fmt = self._format_type(type, count)
        try:
            pack_into(fmt, self.data, offset, *data)
@ -52,54 +63,109 @@ class ByteBuffer():
    def hasData(self):
        return self.offset < self.end

-    def realign_writes(self, size = 4):
+    def realign_writes(self, size=4):
        while len(self) % size:
            self.append_u8(0)

-    def realign_reads(self, size = 4):
+    def realign_reads(self, size=4):
        while self.offset % size:
            self.offset += 1

    def __len__(self):
        return len(self.data)

-typeMap = {
-    's8'  : 'b',
-    's16' : 'h',
-    's32' : 'i',
-    's64' : 'q',
-    'u8'  : 'B',
-    'u16' : 'H',
-    'u32' : 'I',
-    'u64' : 'Q'
-}
+    def get_s8(self) -> int:
+        return self.get("b")

-def _make_get(fmt):
-    def _method(self):
-        return self.get(fmt)
-    return _method
+    def peek_s8(self) -> int:
+        return self.peek("b")

-def _make_peek(fmt):
-    def _method(self):
-        return self.peek(fmt)
-    return _method
+    def append_s8(self, data: int):
+        return self.append(data, "b")

-def _make_append(fmt):
-    def _method(self, data):
-        return self.append(data, fmt)
-    return _method
+    def set_s8(self, data: int, offset: int):
+        return self.set(data, offset, "b")

-def _make_set(fmt):
-    def _method(self, data, offset):
-        return self.set(data, offset, fmt)
-    return _method
+    def get_s16(self) -> int:
+        return self.get("h")

-for name, fmt in typeMap.items():
-    _get = _make_get(fmt)
-    _peek = _make_peek(fmt)
-    _append = _make_append(fmt)
-    _set = _make_set(fmt)
-    setattr(ByteBuffer, 'get_' + name, _get)
-    setattr(ByteBuffer, 'peek_' + name, _peek)
-    setattr(ByteBuffer, 'append_' + name, _append)
-    setattr(ByteBuffer, 'set_' + name, _set)
+    def peek_s16(self) -> int:
+        return self.peek("h")
+
+    def append_s16(self, data: int):
+        return self.append(data, "h")
+
+    def set_s16(self, data: int, offset: int):
+        return self.set(data, offset, "h")
+
+    def get_s32(self) -> int:
+        return self.get("i")
+
+    def peek_s32(self) -> int:
+        return self.peek("i")
+
+    def append_s32(self, data: int):
+        return self.append(data, "i")
+
+    def set_s32(self, data: int, offset: int):
+        return self.set(data, offset, "i")
+
+    def get_s64(self) -> int:
+        return self.get("q")
+
+    def peek_s64(self) -> int:
+        return self.peek("q")
+
+    def append_s64(self, data: int):
+        return self.append(data, "q")
+
+    def set_s64(self, data: int, offset: int):
+        return self.set(data, offset, "q")
+
+    def get_u8(self) -> int:
+        return self.get("B")
+
+    def peek_u8(self) -> int:
+        return self.peek("B")
+
+    def append_u8(self, data: int):
+        return self.append(data, "B")
+
+    def set_u8(self, data: int, offset: int):
+        return self.set(data, offset, "B")
+
+    def get_u16(self) -> int:
+        return self.get("H")
+
+    def peek_u16(self) -> int:
+        return self.peek("H")
+
+    def append_u16(self, data: int):
+        return self.append(data, "H")
+
+    def set_u16(self, data: int, offset: int):
+        return self.set(data, offset, "H")
+
+    def get_u32(self) -> int:
+        return self.get("I")
+
+    def peek_u32(self) -> int:
+        return self.peek("I")
+
+    def append_u32(self, data: int):
+        return self.append(data, "I")
+
+    def set_u32(self, data: int, offset: int):
+        return self.set(data, offset, "I")
+
+    def get_u64(self) -> int:
+        return self.get("Q")
+
+    def peek_u64(self) -> int:
+        return self.peek("Q")
+
+    def append_u64(self, data: int):
+        return self.append(data, "Q")
+
+    def set_u64(self, data: int, offset: int):
+        return self.set(data, offset, "Q")
--- a/kbinxml/format_ids.py
+++ b/kbinxml/format_ids.py
@ -1,22 +1,27 @@
 from struct import pack, unpack

-def parseIP(string):
-    bunch = map(int, string.split('.'))
-    # pack to bytes
-    p = pack('4B', *bunch)
-    # unpack as u16
-    return unpack('>I', p)[0]

-def writeIP(raw):
+def parseIP(string: str) -> int:
+    bunch = map(int, string.split("."))
    # pack to bytes
-    p = pack('>I', raw)
+    p = pack("4B", *bunch)
+    # unpack as u32
+    return unpack(">I", p)[0]
+
+
+def writeIP(raw: int):
+    # pack to bytes
+    p = pack(">I", raw)
    # unpack
-    return '.'.join(map(str, unpack('4B', p)))
+    return ".".join(map(str, unpack("4B", p)))

-def writeFloat(raw):
+
+def writeFloat(raw: float):
    # this is just how floats get printed...
-    return '{0:.6f}'.format(raw)
+    return f"{raw:.6f}"

+
+# fmt: off
 xml_formats = {
    1  : { 'names' : ['void']},
    2  : { 'type' : 'b',  'count' : 1,  'names' : ['s8']},
@ -75,15 +80,16 @@ xml_formats = {
    55 : { 'type' : 'b',  'count' : 4,  'names' : ['4b']},
    56 : { 'type' : 'b',  'count' : 16, 'names' : ['vb']}
 }
+# fmt: on

 # little less boilerplate for writing
 for key, val in xml_formats.items():
-    xml_formats[key]['name'] = xml_formats[key]['names'][0]
+    xml_formats[key]["name"] = xml_formats[key]["names"][0]

-xml_types = {}
+xml_types: dict[str, int] = {}
 for key, val in xml_formats.items():
-    for n in val['names']:
+    for n in val["names"]:
        xml_types[n] = key
-xml_types['nodeStart'] = 1
-xml_types['nodeEnd'] = 190
-xml_types['endSection'] = 191
+xml_types["nodeStart"] = 1
+xml_types["nodeEnd"] = 190
+xml_types["endSection"] = 191
--- a/kbinxml/kbinxml.py
+++ b/kbinxml/kbinxml.py
@ -1,18 +1,14 @@
-# python 3 style, ints instead of b''
-from builtins import bytes
-from struct import calcsize
-import string
-import sys
+import argparse
 import operator
+import sys
 from io import BytesIO
+from struct import calcsize

 import lxml.etree as etree

 from .bytebuffer import ByteBuffer
-from .sixbit import pack_sixbit, unpack_sixbit
 from .format_ids import xml_formats, xml_types
-
-stdout = getattr(sys.stdout, 'buffer', sys.stdout)
+from .sixbit import pack_sixbit, unpack_sixbit

 DEBUG_OFFSETS = False
 DEBUG = False
@ -22,27 +18,42 @@ SIGNATURE = 0xA0
 SIG_COMPRESSED = 0x42
 SIG_UNCOMPRESSED = 0x45

-XML_ENCODING = 'UTF-8'
-BIN_ENCODING = 'SHIFT_JISX0213'
+XML_ENCODING = "UTF-8"
+BIN_ENCODING = "cp932"  # windows shift-jis variant

 # NOTE: all of these are their python codec names
 encoding_strings = {
-    0x20: 'ASCII',
-    0x00: 'ISO-8859-1',
-    0x60: 'EUC_JP',
-    0x80: 'SHIFT_JISX0213',
-    0xA0: 'UTF-8'
+    0x00: "cp932",
+    0x20: "ASCII",
+    0x40: "ISO-8859-1",
+    0x60: "EUC_JP",
+    0x80: "cp932",
+    0xA0: "UTF-8",
 }

-encoding_vals = {val : key for key, val in encoding_strings.items()}
+encoding_vals = {val: key for key, val in encoding_strings.items()}
+# ensure that duplicated value from above is correct. Avoid exporting 0x00 type
+encoding_vals["cp932"] = 0x80
+

 def debug_print(string):
    if DEBUG:
        print(string)

-class KBinXML():

-    def __init__(self, input):
+class KBinException(Exception):
+    pass
+
+
+class KBinXML:
+    def __init__(self, input, convert_illegal_things=False):
+        """If `convert_illegal_things` is true,
+        - Any shift-jis string that cannot be decoded as shift-jis will
+          try to be decoded as utf-8
+        - If a node name is invalid (for example, it starts with a number),
+          the name will be prefixed with an underscore
+        """
+        self.convert_illegal_things = convert_illegal_things
        if isinstance(input, etree._Element):
            self.xml_doc = input
        elif isinstance(input, etree._ElementTree):
@ -52,38 +63,119 @@ class KBinXML():
        else:
            self.from_text(input)

-    def to_text(self):
+    def to_text(self) -> str:
        # we decode again because I want unicode, dammit
-        return etree.tostring(self.xml_doc, pretty_print=True,
-            encoding=XML_ENCODING, xml_declaration=True).decode(XML_ENCODING)
+        return etree.tostring(
+            self.xml_doc, pretty_print=True, encoding=XML_ENCODING, xml_declaration=True
+        ).decode(XML_ENCODING)

    def from_text(self, input):
        self.xml_doc = etree.parse(BytesIO(input)).getroot()
        self.encoding = XML_ENCODING
+        self.compressed = True
+        self.dataSize = None

    @staticmethod
    def is_binary_xml(input):
+        if len(input) < 2:
+            return False
+
        nodeBuf = ByteBuffer(input)
-        return (nodeBuf.get_u8() == SIGNATURE and
-            nodeBuf.get_u8() in (SIG_COMPRESSED, SIG_UNCOMPRESSED))
+        return nodeBuf.get_u8() == SIGNATURE and nodeBuf.get_u8() in (
+            SIG_COMPRESSED,
+            SIG_UNCOMPRESSED,
+        )
+
+    @property
+    def _data_mem_size(self):
+        # This is probably better to be done in the parsing/writeout stage...
+
+        data_len = 0
+        for e in self.xml_doc.iter(tag=etree.Element):
+            t = e.attrib.get("__type")
+            if t is None:
+                continue
+
+            count = e.attrib.get("__count", 1)
+            size = e.attrib.get("__size", 1)
+            x = xml_formats[xml_types[t]]
+            if x["count"] > 0:
+                m = x["count"] * calcsize(x["type"]) * count * size
+            elif x["name"] == "bin":
+                m = len(e.text) // 2
+            else:  # string
+                # null terminator space
+                m = len(e.text.encode(self.encoding)) + 1
+
+            if m <= 4:
+                continue
+
+            if x["name"] == "bin":
+                data_len += (m + 1) & ~1
+            else:
+                data_len += (m + 3) & ~3
+        return data_len
+
+    @property
+    def mem_size(self):
+        """used when allocating memory ingame"""
+
+        data_len = self._data_mem_size
+        node_count = len(list(self.xml_doc.iter(tag=etree.Element)))
+
+        if self.compressed:
+            size = 52 * node_count + data_len + 630
+        else:
+            tags_len = 0
+            for e in self.xml_doc.iter(tag=etree.Element):
+                e_len = max(len(e.tag), 8)
+                e_len = (e_len + 3) & ~3
+                tags_len += e_len
+
+            size = 56 * node_count + data_len + 630 + tags_len
+
+        # debugging
+        # print('nodes:{} ({}) data:{} ({})'.format(node_count,hex(node_count), data_len, hex(data_len)))
+
+        return (size + 8) & ~7

    def data_grab_auto(self):
        size = self.dataBuf.get_s32()
-        ret = self.dataBuf.get('B', size)
+        ret = self.dataBuf.get_bytes(size)
        self.dataBuf.realign_reads()
        return ret

    def data_append_auto(self, data):
        self.dataBuf.append_s32(len(data))
-        self.dataBuf.append(data, 'B', len(data))
+        self.dataBuf.append_bytes(data)
        self.dataBuf.realign_writes()

    def data_grab_string(self):
        data = self.data_grab_auto()
-        return bytes(data[:-1]).decode(self.encoding)
+        data = bytes(data[:-1])
+        try:
+            return data.decode(self.encoding)
+        except UnicodeDecodeError as e:
+            if self.encoding == "cp932":
+                if not self.convert_illegal_things:
+                    raise KBinException(
+                        f"Could not decode string. To force utf8 decode {convert_illegal_help}."
+                    ) from e
+
+                # having to do this kinda sucks, but it's better than just giving up
+                print(
+                    "KBinXML: Malformed Shift-JIS string found, attempting UTF-8 decode",
+                    file=sys.stderr,
+                )
+                print("KBinXML: Raw string data:", data, file=sys.stderr)
+                return data.decode("utf8")
+            else:
+                # in the unlikely event of malformed data that isn't shift-jis,
+                # fix it later
+                raise

    def data_append_string(self, string):
-        string = bytes(string.encode(self.encoding) + b'\0')
+        string = bytes(string.encode(self.encoding) + b"\0")
        self.data_append_auto(string)

    # has its own separate state and other assorted garbage
@ -127,70 +219,99 @@ class KBinXML():
            self.dataBuf.append(data, type, count)
            self.dataBuf.realign_writes()

+    def append_node_name(self, name):
+        if self.compressed:
+            pack_sixbit(name, self.nodeBuf)
+        else:
+            enc = name.encode(self.encoding)
+            self.nodeBuf.append_u8((len(enc) - 1) | 64)
+            self.nodeBuf.append_bytes(enc)
+
+    def _add_namespace(self, node, name, value):
+        """Add a namespace (xmlns) to an existing node. Returns the new node to
+        work with"""
+
+        # I wish this worked, but we need to specify it in the constructor
+        # node.nsmap[name] = value
+        ns = node.nsmap
+        ns[name] = value
+        old_node = node
+        node = etree.Element(old_node.tag, nsmap=ns)
+        node[:] = old_node[:]
+        parent = old_node.getparent()
+        if parent is not None:
+            parent.remove(old_node)
+            parent.append(node)
+        return node
+
    def _node_to_binary(self, node):
-        nodeType = node.attrib.get('__type')
+        nodeType = node.attrib.get("__type")
        if not nodeType:
            # typeless tags with text become string
            if node.text is not None and len(node.text.strip()) > 0:
-                nodeType = 'str'
+                nodeType = "str"
            else:
-                nodeType = 'void'
+                nodeType = "void"
        nodeId = xml_types[nodeType]

        isArray = 0
-        count = node.attrib.get('__count')
+        count = node.attrib.get("__count")
        if count:
            count = int(count)
-            isArray = 64 # bit position for array flag
+            isArray = 64  # bit position for array flag

        self.nodeBuf.append_u8(nodeId | isArray)

        name = node.tag
-        pack_sixbit(name, self.nodeBuf)
+        self.append_node_name(name)

-        if nodeType != 'void':
+        if nodeType != "void":
            fmt = xml_formats[nodeId]

            val = node.text
-            if fmt['name'] == 'bin':
+            if fmt["name"] == "bin":
                data = bytes(bytearray.fromhex(val))
-            elif fmt['name'] == 'str':
-                if val is None: # empty string
-                    val = ''
-                data = bytes(val.encode(self.encoding, 'replace') + b'\0')
+            elif fmt["name"] == "str":
+                if val is None:  # empty string
+                    val = ""
+                data = bytes(val.encode(self.encoding, "replace") + b"\0")
            else:
-                val = val.split(' ')
-                data = list(map(fmt.get('fromStr', int), val))
-                if count and len(data) / fmt['count'] != count:
-                    raise ValueError('Array length does not match __count attribute')
+                val = val.split(" ")
+                data = list(map(fmt.get("fromStr", int), val))
+                if count and len(data) / fmt["count"] != count:
+                    raise ValueError("Array length does not match __count attribute")

-            if isArray or fmt['count'] == -1:
-                self.dataBuf.append_u32(len(data) * calcsize(fmt['type']))
-                self.dataBuf.append(data, fmt['type'], len(data))
+            if isArray or fmt["count"] == -1:
+                self.dataBuf.append_u32(len(data) * calcsize(fmt["type"]))
+                self.dataBuf.append(data, fmt["type"], len(data))
                self.dataBuf.realign_writes()
            else:
-                self.data_append_aligned(data, fmt['type'], fmt['count'])
+                self.data_append_aligned(data, fmt["type"], fmt["count"])

        # for test consistency and to be more faithful, sort the attrs
        sorted_attrs = sorted(node.attrib.items(), key=operator.itemgetter(0))
        for key, value in sorted_attrs:
-            if key not in ['__type', '__size', '__count']:
+            if key not in ["__type", "__size", "__count"]:
                self.data_append_string(value)
-                self.nodeBuf.append_u8(xml_types['attr'])
-                pack_sixbit(key, self.nodeBuf)
+                self.nodeBuf.append_u8(xml_types["attr"])
+                self.append_node_name(key)

        for child in node.iterchildren(tag=etree.Element):
            self._node_to_binary(child)

        # always has the isArray bit set
-        self.nodeBuf.append_u8(xml_types['nodeEnd'] | 64)
+        self.nodeBuf.append_u8(xml_types["nodeEnd"] | 64)

-    def to_binary(self):
-        self.encoding = BIN_ENCODING
+    def to_binary(self, encoding=BIN_ENCODING, compressed=True):
+        self.encoding = encoding
+        self.compressed = compressed

        header = ByteBuffer()
        header.append_u8(SIGNATURE)
-        header.append_u8(SIG_COMPRESSED)
+        if self.compressed:
+            header.append_u8(SIG_COMPRESSED)
+        else:
+            header.append_u8(SIG_UNCOMPRESSED)
        header.append_u8(encoding_vals[self.encoding])
        # Python's ints are big, so can't just bitwise invert
        header.append_u8(0xFF ^ encoding_vals[self.encoding])
@ -202,14 +323,15 @@ class KBinXML():
        self._node_to_binary(self.xml_doc)

        # always has the isArray bit set
-        self.nodeBuf.append_u8(xml_types['endSection'] | 64)
+        self.nodeBuf.append_u8(xml_types["endSection"] | 64)
        self.nodeBuf.realign_writes()
        header.append_u32(len(self.nodeBuf))
-        self.nodeBuf.append_u32(len(self.dataBuf))
+        self.dataSize = len(self.dataBuf)
+        self.nodeBuf.append_u32(self.dataSize)
        return bytes(header.data + self.nodeBuf.data + self.dataBuf.data)

    def from_binary(self, input):
-        self.xml_doc = etree.Element('root')
+        self.xml_doc = etree.Element("root")
        node = self.xml_doc

        self.nodeBuf = ByteBuffer(input)
@ -227,7 +349,7 @@ class KBinXML():
        self.nodeBuf.end = nodeEnd

        self.dataBuf = ByteBuffer(input, nodeEnd)
-        dataSize = self.dataBuf.get_u32()
+        self.dataSize = self.dataBuf.get_u32()
        # This is all no fun
        self.dataByteBuf = ByteBuffer(input, nodeEnd)
        self.dataWordBuf = ByteBuffer(input, nodeEnd)
@ -242,88 +364,130 @@ class KBinXML():
            isArray = nodeType & 64
            nodeType &= ~64

-            nodeFormat = xml_formats.get(nodeType, {'name':'Unknown'})
-            debug_print('Node type is {} ({})'.format(nodeFormat['name'], nodeType))
+            nodeFormat = xml_formats.get(nodeType, {"name": "Unknown"})
+            debug_print("Node type is {} ({})".format(nodeFormat["name"], nodeType))

            # node or attribute name
-            name = ''
-            if nodeType != xml_types['nodeEnd'] and nodeType != xml_types['endSection']:
+            name = ""
+            if nodeType != xml_types["nodeEnd"] and nodeType != xml_types["endSection"]:
                if self.compressed:
                    name = unpack_sixbit(self.nodeBuf)
                else:
                    length = (self.nodeBuf.get_u8() & ~64) + 1
-                    name = self.nodeBuf.get('B', length)
+                    name = self.nodeBuf.get_bytes(length)
                    name = bytes(name).decode(self.encoding)
                debug_print(name)

            skip = True

-            if nodeType == xml_types['attr']:
+            if nodeType == xml_types["attr"]:
                value = self.data_grab_string()
-                node.attrib[name] = value
-            elif nodeType == xml_types['nodeEnd']:
+                # because someone thought it was a good idea to serialise namespaces
+                if name.startswith("xmlns:"):
+                    _, name = name.split("xmlns:")
+                    node = self._add_namespace(node, name, value)
+                elif ":" in name:
+                    prefix, name = name.split(":")
+                    # if this fails, the xml is invalid. Open an issue.
+                    node.set(etree.QName(node.nsmap[prefix], name), value)
+                # this is the case you'll get in 99% of places
+                else:
+                    node.attrib[name] = value
+            elif nodeType == xml_types["nodeEnd"]:
                if node.getparent() is not None:
                    node = node.getparent()
-            elif nodeType == xml_types['endSection']:
+            elif nodeType == xml_types["endSection"]:
                nodesLeft = False
            elif nodeType not in xml_formats:
-                raise NotImplementedError('Implement node {}'.format(nodeType))
-            else: # inner value to process
+                raise NotImplementedError("Implement node {}".format(nodeType))
+            else:  # inner value to process
                skip = False

            if skip:
                continue

-            child = etree.SubElement(node, name)
+            try:
+                child = etree.SubElement(node, name)
+            except ValueError as e:
+                fixed_name = f"_{name}"
+                if self.convert_illegal_things:
+                    # todo: there are other invalid node names. Fix them when you see them.
+                    child = etree.SubElement(node, fixed_name)
+                else:
+                    raise KBinException(
+                        f'Could not create node with name "{name}". To rename it to "{fixed_name}", {convert_illegal_help}.'
+                    ) from e
            node = child

-            if nodeType == xml_types['nodeStart']:
+            if nodeType == xml_types["nodeStart"]:
                continue

-            node.attrib['__type'] = nodeFormat['name']
+            node.attrib["__type"] = nodeFormat["name"]

-            varCount = nodeFormat['count']
+            varCount = nodeFormat["count"]
            arrayCount = 1
-            if varCount == -1: # the 2 cannot be combined
+            if varCount == -1:  # the 2 cannot be combined
                varCount = self.dataBuf.get_u32()
                isArray = True
            elif isArray:
-                arrayCount = self.dataBuf.get_u32() // (calcsize(nodeFormat['type'] * varCount))
-                node.attrib['__count'] = str(arrayCount)
+                arrayCount = self.dataBuf.get_u32() // (
+                    calcsize(nodeFormat["type"] * varCount)
+                )
+                node.attrib["__count"] = str(arrayCount)
            totalCount = arrayCount * varCount

            if isArray:
-                data = self.dataBuf.get(nodeFormat['type'], totalCount)
+                data = self.dataBuf.get(nodeFormat["type"], totalCount)
                self.dataBuf.realign_reads()
            else:
-                data = self.data_grab_aligned(nodeFormat['type'], totalCount)
+                data = self.data_grab_aligned(nodeFormat["type"], totalCount)

-            if nodeType == xml_types['binary']:
-                node.attrib['__size'] = str(totalCount)
-                string = ''.join(('{0:02x}'.format(x) for x in data))
-            elif nodeType == xml_types['string']:
+            if nodeType == xml_types["binary"]:
+                node.attrib["__size"] = str(totalCount)
+                string = "".join(("{0:02x}".format(x) for x in data))
+            elif nodeType == xml_types["string"]:
                string = bytes(data[:-1]).decode(self.encoding)
            else:
-                string = ' '.join(map(nodeFormat.get('toStr', str), data))
+                string = " ".join(map(nodeFormat.get("toStr", str), data))

-            node.text = string
+            # some strings have extra NUL bytes, compatible behaviour is to strip
+            node.text = string.strip("\0")

        # because we need the 'real' root
        self.xml_doc = self.xml_doc[0]

-def main():
-    if len(sys.argv) != 2:
-        print('bin_xml.py file.[xml/bin]')
-        exit()

-    with open(sys.argv[1], 'rb') as f:
+convert_illegal_help = "set convert_illegal_things=True in the KBinXML constructor"
+
+
+def main():
+    # interestingly, this doesn't work if added inside the
+    # `if __name__ == "__main__"` branch
+    global convert_illegal_help
+    convert_illegal_help = "add the --convert-illegal flag"
+
+    parser = argparse.ArgumentParser(
+        prog="kbinxml", description="Convert kbin to xml, or xml to kbin"
+    )
+    parser.add_argument("filename", metavar="file.[xml/bin]")
+    parser.add_argument("--convert-illegal", action="store_true")
+
+    args = parser.parse_args()
+
+    with open(args.filename, "rb") as f:
        input = f.read()

-    xml = KBinXML(input)
-    if KBinXML.is_binary_xml(input):
-        stdout.write(xml.to_text().encode('utf-8'))
-    else:
-        stdout.write(xml.to_binary())
+    xml = KBinXML(input, convert_illegal_things=args.convert_illegal)
+    stdout = getattr(sys.stdout, "buffer", sys.stdout)
+    try:
+        if KBinXML.is_binary_xml(input):
+            stdout.write(xml.to_text().encode("utf-8"))
+        else:
+            stdout.write(xml.to_binary())
+    except BrokenPipeError:
+        # allows kbinxml to be piped to `head` or similar
+        sys.exit(141)

-if __name__ == '__main__':
+
+if __name__ == "__main__":
    main()
--- a/kbinxml/sixbit.py
+++ b/kbinxml/sixbit.py
@ -1,56 +1,36 @@
-# python 3 style, ints instead of b''
-from builtins import bytes
-from bitarray import bitarray
+from kbinxml.bytebuffer import ByteBuffer

-def pack_sixbit(string, byteBuf):
-    chars = str_to_sixbit(string)
-    bits = bitarray(endian='big')
+
+charmap = "0123456789:ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz"
+bytemap = {c: i for i, c in enumerate(charmap)}
+
+
+def pack_sixbit(string: str, byteBuf: ByteBuffer):
+    chars = [bytemap[x] for x in string]
+    padding = 8 - (len(string) * 6 % 8)
+    if padding == 8:
+        padding = 0
+    bits = 0
    for c in chars:
-        bits.frombytes(c.encode())
-        # leave only the 6 bits we care for
-        del bits[-8:-6]
-    data = bytes(bits.tobytes())
-    byteBuf.append_u8(len(string))
-    byteBuf.append(data, 'B', len(data))
+        bits <<= 6
+        bits |= c
+    bits <<= padding
+    data = bits.to_bytes((len(string) * 6 + padding) // 8, byteorder="big")
+    byteBuf.append_bytes((len(string),))
+    byteBuf.append_bytes(data)

-def unpack_sixbit(byteBuf):
+
+def unpack_sixbit(byteBuf: ByteBuffer):
    length = byteBuf.get_u8()
-    length_bytes = (length * 6 + 7) // 8
-    bitBuf = bitarray(endian='big')
-    bitBuf.frombytes(bytes(byteBuf.get('B', length_bytes)))
+    length_bits = length * 6
+    length_bytes = (length_bits + 7) // 8
+    padding = 8 - (length_bits % 8)
+    if padding == 8:
+        padding = 0
+    bits = int.from_bytes(byteBuf.get_bytes(length_bytes), byteorder="big")
+    bits >>= padding
    result = []
-    offset = 0
-    for i in range(length):
-        result.append(ord(bitBuf[offset:offset+6].tobytes()) >> (8 - 6))
-        offset += 6
-    return sixbit_to_str(result)
-
-# 0-9 for numbers, 10 is ':', 11 to 36 for capitals, 37 for underscore, 38-63 for lowercase
-def sixbit_to_str(decompressed):
-    string = ''
-    for d in decompressed:
-        if d <= 10:
-            d += ord('0')
-        elif d < 37:
-            d += 54
-        elif d == 37:
-            d += 58
-        else:
-            d += 59
-        string += chr(d)
-    return string
-
-def str_to_sixbit(string):
-    compress = []
-    for c in string:
-        if c >= '0' and c <= ':':
-            compress.append(ord(c) - ord('0'))
-        elif c >= 'A' and c <= 'Z':
-            compress.append(ord(c) - 54)
-        elif c == '_':
-            compress.append(ord(c) - 58)
-        elif c >= 'a' and c <= 'z':
-            compress.append(ord(c) - 59)
-        else:
-            raise ValueError('Node or attribute name can only contain alphanumeric + underscore')
-    return ''.join(map(chr, compress))
+    for _ in range(length):
+        result.append(bits & 0b111111)
+        bits >>= 6
+    return "".join([charmap[x] for x in result[::-1]])
--- a/kbinxml/test.py
+++ b/kbinxml/test.py
@ -1,28 +1,28 @
 from .kbinxml import KBinXML
-# python 2/3 cross compat
-from io import open

-with open('testcases.xml', 'rb') as f:
+with open("testcases.xml", "rb") as f:
    xml_in = f.read()
-with open('testcases_out.xml', 'r', encoding='UTF-8') as f:
+with open("testcases_out.xml", "r", encoding="UTF-8") as f:
    expected_xml = f.read()
-with open('testcases_out.kbin', 'rb') as f:
+with open("testcases_out.kbin", "rb") as f:
    expected_bin = f.read()

+# Encode the XML fixture and compare it against the reference binary.
 k = KBinXML(xml_in)
 kbin = k.to_binary()
 if kbin != expected_bin:
-    with open('failed_test.kbin', 'wb') as f:
+    with open("failed_test.kbin", "wb") as f:
        f.write(kbin)
-    raise AssertionError('Binary output does not match, check failed_test.kbin')
+    raise AssertionError("Binary output does not match, check failed_test.kbin")
 else:
-    print('XML -> Binary correct!')
+    print("XML -> Binary correct!")

+# Decode that binary back to text and compare it against the reference XML.
 backwards = KBinXML(kbin)
 btext = backwards.to_text()
 if btext != expected_xml:
-    with open('failed_test.xml', 'w', encoding='UTF-8') as f:
+    with open("failed_test.xml", "w", encoding="UTF-8") as f:
        f.write(btext)
-    raise AssertionError('XML putput does not match, check failed_test.xml')
+    raise AssertionError("XML output does not match, check failed_test.xml")
 else:
-    print('Binary -> XML correct!')
+    print("Binary -> XML correct!")
--- a/setup.py
+++ b/setup.py
@ -1,23 +1,26 @
 from setuptools import setup
-import sys
-

 requires = [
-        'bitarray',
-        'lxml',
+    "lxml",
 ]
-if sys.version_info < (3,0):
-    requires.append('future')

+# Minimum supported interpreter; forwarded to setup() below so pip enforces it.
+python_requires = ">=3.10"
+
+version = "2.1"
 setup(
-    name='kbinxml',
-    version='1.1',
-    entry_points = {
-        'console_scripts': ['kbinxml=kbinxml:main'],
+    name="kbinxml",
+    description="Decoder/encoder for Konami's binary XML format",
+    long_description="See Github for up to date documentation",
+    version=version,
+    entry_points={
+        "console_scripts": ["kbinxml=kbinxml:main"],
    },
-    packages=['kbinxml'],
-    url='https://github.com/mon/kbinxml/',
-    author='mon',
-    author_email='me@mon.im',
-    install_requires=requires
+    packages=["kbinxml"],
+    url="https://github.com/mon/kbinxml/",
+    download_url="https://github.com/mon/kbinxml/archive/{}.tar.gz".format(version),
+    author="mon",
+    author_email="me@mon.im",
+    install_requires=requires,
+    python_requires=python_requires,
 )
--- a/testcases.xml
+++ b/testcases.xml
@ -8,9 +8,11 @@
    <!-- Testing encoding, plus __type-less should become string -->
    <superstar babe="ミツル">シ　イス　マイ　ワイフ</superstar>
    <!-- Testing 6bit conversion -->
-    <xXx_T4GG3R_xXx __type="3u8">8 9 10</xXx_T4GG3R_xXx>
-    <!-- Shouldn't have alignment issues from the 3u8 -->
+    <xXx_T4GG3R_xXx __type="2u8">8 9</xXx_T4GG3R_xXx>
+    <!-- Shouldn't have alignment issues from the 2u8 -->
    <aligned __type="u8">12</aligned>
+    <!-- Array contents are never packed -->
+    <aligned_arr __type="u8" __count="1">13</aligned_arr>
    <!-- Binary parsing -->
    <entry __type="binary">DEADBEEF</entry>
    <!-- Lowercase, too -->
--- a/testcases_out.kbin
+++ b/testcases_out.kbin
--- a/testcases_out.xml
+++ b/testcases_out.xml
@ -3,8 +3,9 @@
  <entry __type="ip4" __count="2">127.0.0.1 192.168.0.1</entry>
  <entry __type="str" attr="test" attr2="best">Hello, world!</entry>
  <superstar __type="str" babe="ミツル">シ　イス　マイ　ワイフ</superstar>
-  <xXx_T4GG3R_xXx __type="3u8">8 9 10</xXx_T4GG3R_xXx>
+  <xXx_T4GG3R_xXx __type="2u8">8 9</xXx_T4GG3R_xXx>
  <aligned __type="u8">12</aligned>
+  <aligned_arr __type="u8" __count="1">13</aligned_arr>
  <entry __type="bin" __size="4">deadbeef</entry>
  <entry __type="bin" __size="4">deadbe7a</entry>
  <entry __type="3u8" __count="2">1 2 3 1 2 3</entry>
Author	SHA1	Message	Date
William Toohey	1b6a3580cc	Fix BrokenPipeError when piping kbinxml output	2024-04-28 10:54:05 +10:00
William Toohey	b1cb927fd2	Remove py2 support, format code, add type annotations, convert illegal node names	2023-10-26 18:02:31 +10:00
William Toohey	3a7f038c70	Merge branch 'master' of https://github.com/mon/kbinxml	2023-07-13 21:09:24 +10:00
William Toohey	ae7c172820	Be more lenient when decoding malformed strings	2023-07-13 21:09:15 +10:00
Shiz	718d4c8478	testcases: update output kbin for array alignmen test git add -p is deceptive, whoops.	2023-06-03 11:22:20 +10:00
Will	ce3c47325f	Merge pull request #7 from Shizmob/feature/array-alignment-testcase testcases: add array packing/alignment test case	2023-06-03 10:11:54 +10:00
Shiz	48db748632	testcases: add array packing/alignment test case array reads are never taken from the packed buffers, even if the total array size would easily fit in them.	2023-06-02 14:40:21 +02:00
William Toohey	b847de5453	Bump version to 1.7	2021-05-03 22:33:39 +10:00
William Toohey	5707a6bf8b	is_binary_xml: don't explode on too-short inputs	2021-05-03 22:32:27 +10:00
William Toohey	3cbc3179a8	Support namespace deserialisation	2020-07-10 10:23:36 +10:00
William Toohey	b1339d6c4b	Fix strings with extra NUL padding	2020-07-10 09:26:55 +10:00
William Toohey	ca4a6e309e	re-add 0x00 encoding	2018-06-19 16:26:43 +10:00
William Toohey	8784a76672	ISO 8859 was also wrong	2018-06-15 15:58:32 +10:00
William Toohey	68166ea9ab	I was using the wrong encoding the whole time	2018-06-15 15:08:42 +10:00
William Toohey	fed34e0943	Fix python 2 (again)	2018-05-31 15:23:29 +10:00
William Toohey	042323917c	Drop bitarray requirement	2018-05-25 23:59:55 +10:00
William Toohey	dccacdb6e6	Fix python2 (again)	2018-01-30 11:16:21 +10:00
William Toohey	50fa4b8d23	Support writing compressed tag names	2018-01-10 22:24:59 +10:00
William Toohey	a4eb17c70b	Speed improvements	2018-01-09 21:25:56 +10:00
William Toohey	acd3a4195d	PyPi!	2018-01-09 17:05:30 +10:00
William Toohey	1fc6b47484	In-memory size calculation	2018-01-09 16:55:13 +10:00
Will	a77eee91db	Create LICENSE	2018-01-07 22:41:52 +10:00
William Toohey	4e53144e42	Simpler install guide	2018-01-05 01:56:41 +10:00