From 84f6ffd116bc77e04a981b559040be8fc632f76f Mon Sep 17 00:00:00 2001
From: William Toohey <will@mon.im>
Date: Thu, 22 Jun 2017 00:11:28 +1000
Subject: [PATCH] Much needed cleanup/restructure

---
 bin_xml.py    | 214 +++++++++++++++++++++-----------------------------
 bytebuffer.py |   8 +-
 format_ids.py | 116 +++++++++++++--------------
 sixbit.py     |  55 +++++++++++++
 4 files changed, 205 insertions(+), 188 deletions(-)
 create mode 100644 sixbit.py

diff --git a/bin_xml.py b/bin_xml.py
index dd67ad1..6947775 100644
--- a/bin_xml.py
+++ b/bin_xml.py
@@ -1,103 +1,72 @@
 from xml.dom import minidom
 from struct import calcsize
 import string
-from bitarray import bitarray
-from bytebuffer import ByteBuffer
-from format_ids import xml_formats, xml_types
 import sys
+import operator
+
+from bytebuffer import ByteBuffer
+from sixbit import pack_sixbit, unpack_sixbit
+from format_ids import xml_formats, xml_types
 
 DEBUG_OFFSETS = False
 DEBUG = False
 
-SIGNATURE = 0xA042
+SIGNATURE = 0xA0
 
-encodings = [
-    None,
-    'ASCII',
-    'ISO-8859-1',
-    'EUC-JP',
-    'SHIFT_JIS',
-    'UTF-8'
-]
+SIG_COMPRESSED = 0x42
+SIG_UNCOMPRESSED = 0x45
+
+XML_ENCODING = 'UTF-8'  # ends up in the XML declaration, so use the standard name rather than the codec alias
+BIN_ENCODING = 'SHIFT_JISX0213'
+
+# NOTE: all of these are their python codec names
+encoding_strings = {
+    0x20: 'ASCII',
+    0x00: 'ISO-8859-1',
+    0x60: 'EUC_JP',
+    0x80: 'SHIFT_JISX0213',
+    0xA0: 'UTF_8'
+}
+
+encoding_vals = {val : key for key, val in encoding_strings.items()}
 
 def debug_print(string):
     if DEBUG:
         print string
 
-class kbinxml():
+class KBinXML():
 
     def __init__(self, input):
         if isinstance(input, minidom.Document):
             self.xml_doc = input
-        elif self.is_binary_xml(input):
+        elif KBinXML.is_binary_xml(input):
             self.from_binary(input)
         else:
             self.from_text(input)
 
-    def pack_bits(self, string, bits = 6):
-        chars = self.str_to_sixbit(string)
-        bits = bitarray(endian='big')
-        for c in chars:
-            bits.frombytes(c)
-            del bits[-8:-6]
-        for c in bits.tobytes():
-            self.nodeBuf.append_u8(ord(c))
+    def to_text(self):
+        return self.xml_doc.toprettyxml(indent = "    ", encoding = XML_ENCODING)
 
-    def unpack_bits(self, length, bits = 6):
-        result = []
-        offset = self.nodeBuf.offset * 8
-        for i in range(length):
-            result.append(ord(self.nodeBits[offset:offset+bits].tobytes()) >> (8 - bits))
-            offset += bits
-        # padding
-        self.nodeBuf.offset += (length * bits + 7) // 8
-        return self.sixbit_to_str(result)
+    def from_text(self, input):
+        self.xml_doc = minidom.parseString(input)
 
-    # 0-9 for numbers, 10 to 36 for capitals, 37 for underscore, 38-63 for lowercase
-    def sixbit_to_str(self, decompressed):
-        string = ''
-        for d in decompressed:
-            if d <= 10:
-                d += ord('0')
-            elif d < 37:
-                d += 54
-            elif d == 37:
-                d += 58
-            else:
-                d += 59
-            string += chr(d)
-        return string
-
-    def str_to_sixbit(self, string):
-        compress = []
-        for c in string:
-            if c >= '0' and c <= '9':
-                compress.append(ord(c) - ord('0'))
-            elif c >= 'A' and c <= 'Z':
-                compress.append(ord(c) - 54)
-            elif c == '_':
-                compress.append(ord(c) - 58)
-            elif c >= 'a' and c <= 'z':
-                compress.append(ord(c) - 59)
-            else:
-                raise ValueError('Node name can only contain alphanumeric + underscore')
-        return ''.join(map(chr, compress))
+    @staticmethod
+    def is_binary_xml(input):
+        nodeBuf = ByteBuffer(input)
+        return (nodeBuf.get_u8() == SIGNATURE and
+            nodeBuf.get_u8() in (SIG_COMPRESSED, SIG_UNCOMPRESSED))
 
     def data_grab_auto(self):
         size = self.dataBuf.get_s32()
-        ret = [self.dataBuf.get_u8() for x in range(size)]
+        ret = self.dataBuf.get('B', size)  # unsigned: data_grab_string feeds these to chr()
         self.dataBuf.realign_reads()
         return ret
 
     def data_append_auto(self, data):
         self.dataBuf.append_s32(len(data))
-        self.dataBuf.append(data, 's', len(data))
+        self.dataBuf.append(data, 'c', len(data))  # 'c' packs the 1-char items of a byte string; 'b' needs ints
         self.dataBuf.realign_writes()
 
-    def data_append_string(self, string):
-        string = string.encode('shift_jisx0213') + '\0'
-        self.data_append_auto(string)
-
     def data_grab_string(self):
         data = self.data_grab_auto()
         res = ''
@@ -105,7 +74,11 @@ class kbinxml():
             if b == 0:
                 break
             res += chr(b)
-        return res.decode('shift_jisx0213')
+        return res.decode(self.encoding)
+
+    def data_append_string(self, string):
+        string = string.encode(self.encoding) + '\0'
+        self.data_append_auto(string)
 
     # has its own separate state and other assorted garbage
     def data_grab_aligned(self, type, count):
@@ -136,7 +109,7 @@ class kbinxml():
         # multiply by count since 2u2 reads from the 16 bit buffer, for example
         size = calcsize(type) * count
         if size == 1:
-            # make room if fresh dword for our stuff
+            # make room for our stuff if fresh dword
             if self.dataByteBuf.offset % 4 == 0:
                 self.dataBuf.append_u32(0)
             self.dataByteBuf.set(data, self.dataByteBuf.offset, type, count)
@@ -148,11 +121,6 @@ class kbinxml():
             self.dataBuf.append(data, type, count)
             self.dataBuf.realign_writes()
 
-
-    def is_binary_xml(self, input):
-        nodeBuf = ByteBuffer(input)
-        return nodeBuf.get_u16() == SIGNATURE
-
     def _node_to_binary(self, node):
         nodeType = node.getAttribute('__type')
         if not nodeType:
@@ -168,18 +136,19 @@ class kbinxml():
         self.nodeBuf.append_u8(nodeId | isArray)
 
         name = node.nodeName
-        self.nodeBuf.append_u8(len(name))
-        self.pack_bits(name)
+        pack_sixbit(name, self.nodeBuf)
 
         if nodeType != 'void':
             fmt = xml_formats[nodeId]
 
             val = node.firstChild.nodeValue
-            if fmt['count'] != -1:
-                val = val.split(fmt.get('delimiter', ' '))
-                data = map(fmt['pType'], val)
+            if fmt['name'] == 'bin':
+                data = bytes(bytearray.fromhex(val))
+            elif fmt['name'] == 'str':
+                data = val.encode(self.encoding) + '\0'
             else:
-                data = fmt['pType'](val)
+                val = val.split(fmt.get('delimiter', ' '))
+                data = map(fmt['pyType'], val)
 
             if isArray or fmt['count'] == -1:
                 self.dataBuf.append_u32(len(data) * calcsize(fmt['type']))
@@ -188,31 +157,30 @@ class kbinxml():
             else:
                 self.data_append_aligned(data, fmt['type'], fmt['count'])
 
-        import operator
-        sorted_x = sorted(node.attributes.items(), key=operator.itemgetter(0))
-        for key, value in sorted_x:#node.attributes.items():
-            if key in ['__type', '__size', '__count']:
-                pass
-            else:
+        # for consistency and to be more faithful
+        sorted_attrs = sorted(node.attributes.items(), key=operator.itemgetter(0))
+        for key, value in sorted_attrs:
+            if key not in ['__type', '__size', '__count']:
                 self.data_append_string(value)
                 self.nodeBuf.append_u8(xml_types['attr'])
-                self.nodeBuf.append_u8(len(key))
-                self.pack_bits(key)
-                
+                pack_sixbit(key, self.nodeBuf)
+
         for child in node.childNodes:
             if child.nodeType != child.TEXT_NODE:
                 self._node_to_binary(child)
 
+        # always has the isArray bit set
         self.nodeBuf.append_u8(xml_types['nodeEnd'] | 64)
 
-    def from_text(self, input):
-        self.xml_doc = minidom.parseString(input)
-
     def to_binary(self):
+        self.encoding = BIN_ENCODING
+
         header = ByteBuffer()
-        header.append_u16(SIGNATURE)
-        header.append_u8(4 << 5) # SHIFT-JIS TODO make encoding variable
-        header.append_u8(0x7F) # TODO what does this do as 7f or ff
+        header.append_u8(SIGNATURE)
+        header.append_u8(SIG_COMPRESSED)
+        header.append_u8(encoding_vals[self.encoding])
+        # Python's ints are big, so can't just bitwise invert
+        header.append_u8(0xFF ^ encoding_vals[self.encoding])
         self.nodeBuf = ByteBuffer()
         self.dataBuf = ByteBuffer()
         self.dataByteBuf = ByteBuffer(self.dataBuf.data)
@@ -221,34 +189,34 @@ class kbinxml():
         for child in self.xml_doc.childNodes:
             self._node_to_binary(child)
 
+        # always has the isArray bit set
         self.nodeBuf.append_u8(xml_types['endSection'] | 64)
         self.nodeBuf.realign_writes()
         header.append_u32(len(self.nodeBuf))
         self.nodeBuf.append_u32(len(self.dataBuf))
         return bytes(header.data + self.nodeBuf.data + self.dataBuf.data)
 
-    def to_text(self):
-        return self.xml_doc.toprettyxml(indent="    ", encoding='UTF-8')
-
     def from_binary(self, input):
         self.xml_doc = minidom.Document()
         node = self.xml_doc
 
         self.nodeBuf = ByteBuffer(input)
-        assert self.nodeBuf.get_u16() == SIGNATURE
-        encoding = encodings[(self.nodeBuf.get_u8() & 0xE0) >> 5]
-        unknown = self.nodeBuf.get_u8()
+        assert self.nodeBuf.get_u8() == SIGNATURE
 
-        # creating bitarrays is slow, cache for speed
-        self.nodeBits = bitarray(endian='big')
-        self.nodeBits.frombytes(input)
+        compress = self.nodeBuf.get_u8()
+        assert compress in (SIG_COMPRESSED, SIG_UNCOMPRESSED)
+        self.compressed = compress == SIG_COMPRESSED
+
+        encoding_key = self.nodeBuf.get_u8()
+        assert self.nodeBuf.get_u8() == 0xFF ^ encoding_key
+        self.encoding = encoding_strings[encoding_key]
 
         nodeEnd = self.nodeBuf.get_u32() + 8
         self.nodeBuf.end = nodeEnd
 
         self.dataBuf = ByteBuffer(input, nodeEnd)
         dataSize = self.dataBuf.get_u32()
-        # WHY MUST YOU DO THIS TO ME
+        # This is all no fun
         self.dataByteBuf = ByteBuffer(input, nodeEnd)
         self.dataWordBuf = ByteBuffer(input, nodeEnd)
 
@@ -265,11 +233,14 @@ class kbinxml():
             nodeFormat = xml_formats.get(nodeType, {'name':'Unknown'})
             debug_print('Node type is {} ({})'.format(nodeFormat['name'], nodeType))
 
-            # node name
+            # node or attribute name
             name = ''
             if nodeType != xml_types['nodeEnd'] and nodeType != xml_types['endSection']:
-                strLen = self.nodeBuf.get_u8()
-                name = self.unpack_bits(strLen)
+                if self.compressed:
+                    name = unpack_sixbit(self.nodeBuf)
+                else:
+                    length = self.nodeBuf.get_u8()
+                    name = self.nodeBuf.get('s', length)
                 debug_print(name)
 
             skip = True
@@ -321,27 +292,20 @@ class kbinxml():
             if nodeType == xml_types['binary']:
                 node.setAttribute('__size', str(totalCount))
                 string = ''.join(('{0:02x}'.format(ord(x)) for x in string))
-            if nodeType == xml_types['string']:
-                string = string[:-1].decode('shift_jisx0213')
+            elif nodeType == xml_types['string']:
+                string = string[:-1].decode(self.encoding)
 
             node.appendChild(self.xml_doc.createTextNode(string))
 
-            #print self.xml_doc.toprettyxml(indent="  ", encoding='UTF-8')
-
 if __name__ == '__main__':
-    if len(sys.argv) < 2:
-        print 'bin_xml.py file1 [file2 ...]'
+    if len(sys.argv) != 2:
+        sys.exit('bin_xml.py file.[xml/bin]')  # print usage and stop instead of falling through
 
-    # by default, confirm the implementation is correct
-    for f in sys.argv[1:]:
-        with open(f, 'rb') as f:
-            input = f.read()
-        xml = kbinxml(input)
+    with open(sys.argv[1], 'rb') as f:
+        input = f.read()
+
+    xml = KBinXML(input)
+    if KBinXML.is_binary_xml(input):
         print xml.to_text()
-        try:
-            # just politely ignore the signature since we don't do encoding yet
-            assert xml.to_binary()[4:] == input[4:]
-        except AssertionError:
-            print 'Files do not match!'
-            with open('out.raw', 'wb') as f:
-                f.write(xml.to_binary())
+    else:
+        print xml.to_binary()
diff --git a/bytebuffer.py b/bytebuffer.py
index 1245458..dbbfe95 100644
--- a/bytebuffer.py
+++ b/bytebuffer.py
@@ -2,6 +2,8 @@ from struct import *
 
 class ByteBuffer():
     def __init__(self, input = b'', offset = 0, endian = '>'):
+        # so multiple ByteBuffers can hold on to one set of underlying data
+        # this is useful for writers in multiple locations
         if isinstance(input, bytearray):
             self.data = input
         else:
@@ -26,20 +28,20 @@ class ByteBuffer():
 
     def peek(self, type, count = None):
         fmt = self._format_type(type, count)
-        ret = unpack(fmt, self.data[self.offset:self.offset+calcsize(fmt)])
+        ret = unpack_from(fmt, self.data, self.offset)
         return ret[0] if count is None else ret
 
     def append(self, data, type, count = None):
         fmt = self._format_type(type, count)
         self.offset += calcsize(fmt)
-        if isinstance(data, list):
+        if isinstance(data, list) or isinstance(data, bytes) and type != 's':
             self.data.extend(pack(fmt, *data))
         else:
             self.data.extend(pack(fmt, data))
 
     def set(self, data, offset, type, count = None):
         fmt = self._format_type(type, count)
-        if isinstance(data, list):
+        if isinstance(data, list) or isinstance(data, bytes) and type != 's':
             pack_into(fmt, self.data, offset, *data)
         else:
             pack_into(fmt, self.data, offset, data)
diff --git a/format_ids.py b/format_ids.py
index b3b2559..4767fa5 100644
--- a/format_ids.py
+++ b/format_ids.py
@@ -1,64 +1,60 @@
-
-def jisString(string):
-    return string.encode('shift_jisx0213') + '\0'
-
 xml_formats = {
-    1  : { 'type' : None, 'count' : None, 'pType' : None,  'names' : ['void']},
-    2  : { 'type' : 'b',  'count' : 1,    'pType' : int,   'names' : ['s8']},
-    3  : { 'type' : 'B',  'count' : 1,    'pType' : int,   'names' : ['u8']},
-    4  : { 'type' : 'h',  'count' : 1,    'pType' : int,   'names' : ['s16']},
-    5  : { 'type' : 'H',  'count' : 1,    'pType' : int,   'names' : ['u16']},
-    6  : { 'type' : 'i',  'count' : 1,    'pType' : int,   'names' : ['s32']},
-    7  : { 'type' : 'I',  'count' : 1,    'pType' : int,   'names' : ['u32']},
-    8  : { 'type' : 'q',  'count' : 1,    'pType' : int,   'names' : ['s64']},
-    9  : { 'type' : 'Q',  'count' : 1,    'pType' : int,   'names' : ['u64']},
-    10 : { 'type' : 'c',  'count' : -1,   'pType' : bytearray.fromhex, 'names' : ['bin', 'binary'], 'delimiter' : ''},
-    11 : { 'type' : 's',  'count' : -1,   'pType' : jisString, 'names' : ['str', 'string'], 'delimiter' : ''},
-    12 : { 'type' : 'B',  'count' : 4,    'pType' : int,   'names' : ['ip4'], 'delimiter' : '.'},
-    13 : { 'type' : 'I',  'count' : 1,    'pType' : int,   'names' : ['time']}, # todo: how to print
-    14 : { 'type' : 'f',  'count' : 1,    'pType' : float, 'names' : ['float', 'f']},
-    15 : { 'type' : 'd',  'count' : 1,    'pType' : float, 'names' : ['double', 'd']},
-    16 : { 'type' : 'b',  'count' : 2,    'pType' : int,   'names' : ['2s8']},
-    17 : { 'type' : 'B',  'count' : 2,    'pType' : int,   'names' : ['2u8']},
-    18 : { 'type' : 'h',  'count' : 2,    'pType' : int,   'names' : ['2s16']},
-    19 : { 'type' : 'H',  'count' : 2,    'pType' : int,   'names' : ['2u16']},
-    20 : { 'type' : 'i',  'count' : 2,    'pType' : int,   'names' : ['2s32']},
-    21 : { 'type' : 'I',  'count' : 2,    'pType' : int,   'names' : ['2u32']},
-    22 : { 'type' : 'q',  'count' : 2,    'pType' : int,   'names' : ['2s64', 'vs64']},
-    23 : { 'type' : 'Q',  'count' : 2,    'pType' : int,   'names' : ['2u64', 'vu64']},
-    24 : { 'type' : 'f',  'count' : 2,    'pType' : float, 'names' : ['2f']},
-    25 : { 'type' : 'd',  'count' : 2,    'pType' : float, 'names' : ['2d', 'vd']},
-    26 : { 'type' : 'b',  'count' : 3,    'pType' : int,   'names' : ['3s8']},
-    27 : { 'type' : 'B',  'count' : 3,    'pType' : int,   'names' : ['3u8']},
-    28 : { 'type' : 'h',  'count' : 3,    'pType' : int,   'names' : ['3s16']},
-    29 : { 'type' : 'H',  'count' : 3,    'pType' : int,   'names' : ['3u16']},
-    30 : { 'type' : 'i',  'count' : 3,    'pType' : int,   'names' : ['3s32']},
-    31 : { 'type' : 'I',  'count' : 3,    'pType' : int,   'names' : ['3u32']},
-    32 : { 'type' : 'q',  'count' : 3,    'pType' : int,   'names' : ['3s64']},
-    33 : { 'type' : 'Q',  'count' : 3,    'pType' : int,   'names' : ['3u64']},
-    34 : { 'type' : 'f',  'count' : 3,    'pType' : float, 'names' : ['3f']},
-    35 : { 'type' : 'd',  'count' : 3,    'pType' : float, 'names' : ['3d']},
-    36 : { 'type' : 'b',  'count' : 4,    'pType' : int,   'names' : ['4s8']},
-    37 : { 'type' : 'B',  'count' : 4,    'pType' : int,   'names' : ['4u8']},
-    38 : { 'type' : 'h',  'count' : 4,    'pType' : int,   'names' : ['4s16']},
-    39 : { 'type' : 'H',  'count' : 4,    'pType' : int,   'names' : ['4u16']},
-    40 : { 'type' : 'i',  'count' : 4,    'pType' : int,   'names' : ['4s32', 'vs32']},
-    41 : { 'type' : 'I',  'count' : 4,    'pType' : int,   'names' : ['4u32', 'vu32']},
-    42 : { 'type' : 'q',  'count' : 4,    'pType' : int,   'names' : ['4s64']},
-    43 : { 'type' : 'Q',  'count' : 4,    'pType' : int,   'names' : ['4u64']},
-    44 : { 'type' : 'f',  'count' : 4,    'pType' : float, 'names' : ['4f', 'vf']},
-    45 : { 'type' : 'd',  'count' : 4,    'pType' : float, 'names' : ['4d']},
-    46 : { 'type' : None, 'count' : None, 'pType' : None,  'names' : ['attr']},
-    #47 : { 'type' : None, 'count' : None, 'pType' : None,  'names' : ['array']},
-    48 : { 'type' : 'b',  'count' : 16,   'pType' : int,   'names' : ['vs8']},
-    49 : { 'type' : 'B',  'count' : 16,   'pType' : int,   'names' : ['vu8']},
-    50 : { 'type' : 'h',  'count' : 8,    'pType' : int,   'names' : ['vs16']},
-    51 : { 'type' : 'H',  'count' : 8,    'pType' : int,   'names' : ['vu16']},
-    52 : { 'type' : 'b',  'count' : 1,    'pType' : int,   'names' : ['bool', 'b']},
-    53 : { 'type' : 'b',  'count' : 2,    'pType' : int,   'names' : ['2b']},
-    54 : { 'type' : 'b',  'count' : 3,    'pType' : int,   'names' : ['3b']},
-    55 : { 'type' : 'b',  'count' : 4,    'pType' : int,   'names' : ['4b']},
-    56 : { 'type' : 'b',  'count' : 16,   'pType' : int,   'names' : ['vb']}
+    1  : { 'type' : None, 'count' : None, 'pyType' : None,  'names' : ['void']},
+    2  : { 'type' : 'b',  'count' : 1,    'pyType' : int,   'names' : ['s8']},
+    3  : { 'type' : 'B',  'count' : 1,    'pyType' : int,   'names' : ['u8']},
+    4  : { 'type' : 'h',  'count' : 1,    'pyType' : int,   'names' : ['s16']},
+    5  : { 'type' : 'H',  'count' : 1,    'pyType' : int,   'names' : ['u16']},
+    6  : { 'type' : 'i',  'count' : 1,    'pyType' : int,   'names' : ['s32']},
+    7  : { 'type' : 'I',  'count' : 1,    'pyType' : int,   'names' : ['u32']},
+    8  : { 'type' : 'q',  'count' : 1,    'pyType' : int,   'names' : ['s64']},
+    9  : { 'type' : 'Q',  'count' : 1,    'pyType' : int,   'names' : ['u64']},
+    10 : { 'type' : 'c',  'count' : -1,   'pyType' : None, 'names' : ['bin', 'binary'], 'delimiter' : ''},
+    11 : { 'type' : 's',  'count' : -1,   'pyType' : None, 'names' : ['str', 'string'], 'delimiter' : ''},
+    12 : { 'type' : 'B',  'count' : 4,    'pyType' : int,   'names' : ['ip4'], 'delimiter' : '.'},
+    13 : { 'type' : 'I',  'count' : 1,    'pyType' : int,   'names' : ['time']}, # todo: how to print
+    14 : { 'type' : 'f',  'count' : 1,    'pyType' : float, 'names' : ['float', 'f']},
+    15 : { 'type' : 'd',  'count' : 1,    'pyType' : float, 'names' : ['double', 'd']},
+    16 : { 'type' : 'b',  'count' : 2,    'pyType' : int,   'names' : ['2s8']},
+    17 : { 'type' : 'B',  'count' : 2,    'pyType' : int,   'names' : ['2u8']},
+    18 : { 'type' : 'h',  'count' : 2,    'pyType' : int,   'names' : ['2s16']},
+    19 : { 'type' : 'H',  'count' : 2,    'pyType' : int,   'names' : ['2u16']},
+    20 : { 'type' : 'i',  'count' : 2,    'pyType' : int,   'names' : ['2s32']},
+    21 : { 'type' : 'I',  'count' : 2,    'pyType' : int,   'names' : ['2u32']},
+    22 : { 'type' : 'q',  'count' : 2,    'pyType' : int,   'names' : ['2s64', 'vs64']},
+    23 : { 'type' : 'Q',  'count' : 2,    'pyType' : int,   'names' : ['2u64', 'vu64']},
+    24 : { 'type' : 'f',  'count' : 2,    'pyType' : float, 'names' : ['2f']},
+    25 : { 'type' : 'd',  'count' : 2,    'pyType' : float, 'names' : ['2d', 'vd']},
+    26 : { 'type' : 'b',  'count' : 3,    'pyType' : int,   'names' : ['3s8']},
+    27 : { 'type' : 'B',  'count' : 3,    'pyType' : int,   'names' : ['3u8']},
+    28 : { 'type' : 'h',  'count' : 3,    'pyType' : int,   'names' : ['3s16']},
+    29 : { 'type' : 'H',  'count' : 3,    'pyType' : int,   'names' : ['3u16']},
+    30 : { 'type' : 'i',  'count' : 3,    'pyType' : int,   'names' : ['3s32']},
+    31 : { 'type' : 'I',  'count' : 3,    'pyType' : int,   'names' : ['3u32']},
+    32 : { 'type' : 'q',  'count' : 3,    'pyType' : int,   'names' : ['3s64']},
+    33 : { 'type' : 'Q',  'count' : 3,    'pyType' : int,   'names' : ['3u64']},
+    34 : { 'type' : 'f',  'count' : 3,    'pyType' : float, 'names' : ['3f']},
+    35 : { 'type' : 'd',  'count' : 3,    'pyType' : float, 'names' : ['3d']},
+    36 : { 'type' : 'b',  'count' : 4,    'pyType' : int,   'names' : ['4s8']},
+    37 : { 'type' : 'B',  'count' : 4,    'pyType' : int,   'names' : ['4u8']},
+    38 : { 'type' : 'h',  'count' : 4,    'pyType' : int,   'names' : ['4s16']},
+    39 : { 'type' : 'H',  'count' : 4,    'pyType' : int,   'names' : ['4u16']},
+    40 : { 'type' : 'i',  'count' : 4,    'pyType' : int,   'names' : ['4s32', 'vs32']},
+    41 : { 'type' : 'I',  'count' : 4,    'pyType' : int,   'names' : ['4u32', 'vu32']},
+    42 : { 'type' : 'q',  'count' : 4,    'pyType' : int,   'names' : ['4s64']},
+    43 : { 'type' : 'Q',  'count' : 4,    'pyType' : int,   'names' : ['4u64']},
+    44 : { 'type' : 'f',  'count' : 4,    'pyType' : float, 'names' : ['4f', 'vf']},
+    45 : { 'type' : 'd',  'count' : 4,    'pyType' : float, 'names' : ['4d']},
+    46 : { 'type' : None, 'count' : None, 'pyType' : None,  'names' : ['attr']},
+    #47 : { 'type' : None, 'count' : None, 'pyType' : None,  'names' : ['array']},
+    48 : { 'type' : 'b',  'count' : 16,   'pyType' : int,   'names' : ['vs8']},
+    49 : { 'type' : 'B',  'count' : 16,   'pyType' : int,   'names' : ['vu8']},
+    50 : { 'type' : 'h',  'count' : 8,    'pyType' : int,   'names' : ['vs16']},
+    51 : { 'type' : 'H',  'count' : 8,    'pyType' : int,   'names' : ['vu16']},
+    52 : { 'type' : 'b',  'count' : 1,    'pyType' : int,   'names' : ['bool', 'b']},
+    53 : { 'type' : 'b',  'count' : 2,    'pyType' : int,   'names' : ['2b']},
+    54 : { 'type' : 'b',  'count' : 3,    'pyType' : int,   'names' : ['3b']},
+    55 : { 'type' : 'b',  'count' : 4,    'pyType' : int,   'names' : ['4b']},
+    56 : { 'type' : 'b',  'count' : 16,   'pyType' : int,   'names' : ['vb']}
 }
 
 # little less boilerplate for writing
diff --git a/sixbit.py b/sixbit.py
new file mode 100644
index 0000000..d557790
--- /dev/null
+++ b/sixbit.py
@@ -0,0 +1,55 @@
+from bitarray import bitarray
+
+def pack_sixbit(string, byteBuf):
+    chars = str_to_sixbit(string)
+    bits = bitarray(endian='big')
+    for c in chars:
+        bits.frombytes(c)
+        # leave only the 6 bits we care for
+        del bits[-8:-6]
+    data = bits.tobytes()
+    byteBuf.append_u8(len(string))
+    byteBuf.append(data, 'c', len(data))
+
+def unpack_sixbit(byteBuf):
+    length = byteBuf.get_u8()
+    # the packed name is padded out to a whole number of bytes
+    size = (length * 6 + 7) // 8
+    # only convert this name's bytes; converting the whole buffer on every call is slow
+    bitBuf = bitarray(endian='big')
+    bitBuf.frombytes(bytes(byteBuf.data[byteBuf.offset:byteBuf.offset + size]))
+    byteBuf.offset += size
+    result = []
+    for i in range(length):
+        result.append(ord(bitBuf[i * 6:i * 6 + 6].tobytes()) >> (8 - 6))
+    return sixbit_to_str(result)
+
+# 0-9 for numbers, 10 for colon, 11 to 36 for capitals, 37 for underscore, 38-63 for lowercase
+def sixbit_to_str(decompressed):
+    string = ''
+    for d in decompressed:
+        if d <= 10:
+            d += ord('0')
+        elif d < 37:
+            d += 54
+        elif d == 37:
+            d += 58
+        else:
+            d += 59
+        string += chr(d)
+    return string
+
+def str_to_sixbit(string):
+    compress = []
+    for c in string:
+        if c >= '0' and c <= ':':  # ':' directly follows '9' in ASCII and encodes as 10, as sixbit_to_str decodes it
+            compress.append(ord(c) - ord('0'))
+        elif c >= 'A' and c <= 'Z':
+            compress.append(ord(c) - 54)
+        elif c == '_':
+            compress.append(ord(c) - 58)
+        elif c >= 'a' and c <= 'z':
+            compress.append(ord(c) - 59)
+        else:
+            raise ValueError('Node or attribute name can only contain alphanumeric, underscore and colon')
+    return ''.join(map(chr, compress))