[PATCH 02 of 22] radixlink: introduce a new data structure

Jun Wu quark at fb.com
Sun Jun 4 19:59:14 EDT 2017


# HG changeset patch
# User Jun Wu <quark at fb.com>
# Date 1496536858 25200
#      Sat Jun 03 17:40:58 2017 -0700
# Node ID e8e8d713e4b774f6894ee65723e7fdc12bf8a101
# Parent  5c3500de0a229d8aa08058bccec531d8a85b90d9
# Available At https://bitbucket.org/quark-zju/hg-draft
#              hg pull https://bitbucket.org/quark-zju/hg-draft -r e8e8d713e4b7
radixlink: introduce a new data structure

The radixlink structure is designed to address several hard-to-solve
performance issues in Mercurial. It is an on-disk base16 radix tree whose
leaves point to linked lists of integers, i.e. a
"multimap<string, list<uint32_t>>" in C++ terms.

For a long time, core Mercurial has had no real key-value data structure
supporting efficient lookups, and various operations have suboptimal time
complexity:

  1. changelog.rev(node): O(len(repo))
  2. obsstore.precursors[node]: O(len(markers))
  3. adjustlinkrev slow path (a complex problem)

The new data structure could make the first two O(len(node)), and provide a
cache for linkrevs where the keys could be "filenode + path" (or another
mapping could be used to map paths to short byte strings).
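
As a rough usage sketch (not part of this patch; the changelog object "cl"
and the cache file handling around it are illustrative only), a node-to-rev
lookup could be cached with the radixlink class added below:

    rl = radixlink(cachedata)            # cachedata may be None (empty)
    for rev in xrange(rl.sourceoftruthsize, len(cl)):
        rl.insert(cl.node(rev), rev)     # key: 20-byte node, value: rev
    rl.sourceoftruthsize = len(cl)       # record how much has been indexed
    newcachedata = rl.data               # bytes to write back to .hg/cache
    revs = rl[node]                      # O(len(node)) lookup -> [rev]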

Note: we already have a radix tree implementation in revlog.c. However, it
is coupled with the CPython API and uses in-memory pointers, which makes it
hard to reuse. It seems easier to just reinvent an on-disk version.

diff --git a/mercurial/pure/radixlink.py b/mercurial/pure/radixlink.py
new file mode 100644
--- /dev/null
+++ b/mercurial/pure/radixlink.py
@@ -0,0 +1,236 @@
+# radixlink.py - on-disk radix tree index pointing to linked list data
+#
+# Copyright 2017 Facebook, Inc.
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+from __future__ import absolute_import
+
+import struct
+
+def _enlarge(buf, size):
+    """enlarge a bytearray to at least given size"""
+    l = len(buf)
+    if l < size:
+        buf += bytearray(size - l)
+
+def _tob16(buf):
+    """convert buffer to base16 bytearray"""
+    result = bytearray(len(buf) * 2)
+    for i, b in enumerate(bytearray(buf)):
+        result[i * 2] = b >> 4
+        result[i * 2 + 1] = b & 0xf
+    return result
+
+class radixlink(object):
+    """multimap. keys are variable-length bytes, values are lists of uint32s.
+
+    O(len(key)) lookup and insertion.
+
+    A "radixlink" consists of 2 logical parts: index, and link. "index" is a
+    base16 radix tree with pointers to link. "link" stores linked lists.
+
+    The actual implementation mixes the above 2 parts into 1 buffer for
+    simplicity and easy transaction handling. The format of the buffer is:
+
+        radixlink     := header + entry-list
+        header        := source-of-truth-size (4B)
+        entry-list    := radix-entry | entry-list + entry
+        entry         := index-entry | link-entry
+        index-entry   := radix-entry | leaf-entry
+        radix-entry   := '\0' * 4 + link-offset (4B) + index-offset (4B) * 16
+        leaf-entry    := key-length (4B) + link-offset (4B) + key + padding
+        link-entry    := next-link-offset (4B) + value (4B)
+
+        A leaf-entry's key-length cannot be 0, and the whole entry is padded
+        to at least len(radix-entry) bytes so it can be converted to a
+        radix-entry in-place later.
+
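+    For illustration (a sketch derived from the format above, not a
+    normative dump): inserting key '\x12' with value 7 into an empty
+    radixlink yields a 156-byte buffer laid out roughly as:
+
+        offset   0: header, source-of-truth-size = 0
+        offset   4: root radix-entry, slot for base16 digit 1 -> 76
+        offset  76: leaf-entry, key [2] (remaining digit), link-offset -> 148
+        offset 148: link-entry, next-link-offset 0, value 7
+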
+    radixlink should be used as a cache whose source of truth is an
+    append-only data structure. The header records the size (physical, or
+    logical like a revision count) of the append-only source of truth used
+    to build the radixlink. This makes it easy to update the radixlink
+    incrementally.
+
+    radixlink files are expected to live in .hg/cache. The format is designed
+    to be simple and does not even have a version header. Should an
+    incompatible format change happen, we can use different file names in
+    .hg/cache.
+    """
+
+    sourcelen = struct.Struct('>I')
+    headersize = sourcelen.size
+
+    indexoffset = struct.Struct('>I')
+    linkoffset = indexoffset
+
+    keylen = struct.Struct('>I')
+    radixentrysize = keylen.size + linkoffset.size + indexoffset.size * 16
+
+    linkentry = struct.Struct('>II')
+
+    emptyindex = b'\0' * (headersize + radixentrysize)
+
+    def __init__(self, data=None):
+        self.data = data
+
+    def __len__(self):
+        """raw buffer length, could be used to do "isdirty" check"""
+        return len(self._data)
+
+    @property
+    def data(self):
+        """copy of internal state useful for writing to filesystem"""
+        return bytes(self._data)
+
+    @data.setter
+    def data(self, newdata):
+        """replace internal state, useful for loading from filesystem"""
+        self._data = bytearray(newdata or self.emptyindex)
+
+    @property
+    def sourceoftruthsize(self):
+        """bytes of the append-only source of truth used to update the index"""
+        return self.sourcelen.unpack_from(self._data)[0]
+
+    @sourceoftruthsize.setter
+    def sourceoftruthsize(self, newsize):
+        if self.sourceoftruthsize == newsize:
+            return
+        self.sourcelen.pack_into(self._data, 0, newsize)
+
+    def _get(self, key):
+        """lookup a key, return the value list, or an empty array"""
+        ioff = self.headersize
+        bkey = _tob16(key)
+        boff = 0
+        while boff <= len(bkey):
+            ioff, boff, _poff = self._followindex(ioff, bkey, boff)
+            if ioff == 0:
+                return []
+        _ioff, loff = self._readindexlinkoffset(ioff)
+        return self._readlinkvalues(loff)
+
+    def __contains__(self, key):
+        return bool(self._get(key))
+
+    def __getitem__(self, key):
+        v = self._get(key)
+        if not v:
+            raise KeyError(key)
+        return v
+
+    def insert(self, key, value):
+        """D[key (bytes)] = [value (uint32)] + D.get(key, [])"""
+        ioff = self.headersize
+        bkey = _tob16(key)
+        boff = 0
+        data = self._data
+        while boff <= len(bkey):
+            ioff, boff, poff = self._followindex(ioff, bkey, boff, True)
+            if ioff == 0:
+                # create new leaf
+                assert poff != 0
+                ioff = self._appendleafentry(bkey[boff:])
+                self.indexoffset.pack_into(data, poff, ioff)
+                break
+
+        # update linked list
+        ioff, loff = self._readindexlinkoffset(ioff)
+        newloff = self._appendlinkentry(value, loff)
+        self.linkoffset.pack_into(data, ioff, newloff)
+
+    def _followindex(self, offset, b16list, b16offset, splitleaf=False):
+        """follow an index-entry at offset
+
+        For a radix-entry, look up b16list[b16offset] and return new pointers.
+        For a leaf-entry, check the remainder of b16list.
+
+        If splitleaf is True, split a leaf-entry where b16list does not match.
+        Useful for adding new index-entries.
+
+        return (nextoffset, b16offset, poffset). nextoffset is 0 if the key is
+        not found here. poffset is the offset where nextoffset is stored,
+        useful for rewriting it to point to a newly added index-entry.
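+
+        For example (illustrative): with b16list = [1, 2] and b16offset = 0,
+        a radix-entry follows its slot for digit 1 and returns (the slot
+        target, 1, the slot position); a leaf-entry storing key [1, 2]
+        matches and returns (its own offset, len(b16list) + 1, 0).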
+        """
+        data = self._data
+        klen = self.keylen.unpack_from(self._data, offset)[0]
+        koff = offset + self.keylen.size + self.linkoffset.size
+        if klen > 0: # leaf entry
+            if data[koff:koff + klen] == b16list[b16offset:]:
+                return (offset, len(b16list) + 1, 0)
+            if not splitleaf:
+                return (0, b16offset, 0)
+            self._splitleafentry(offset)
+        # radix entry
+        klen = self.keylen.unpack_from(self._data, offset)[0]
+        assert klen == 0
+        if b16offset >= len(b16list):
+            # no need to follow
+            return (offset, b16offset + 1, 0)
+        b16 = b16list[b16offset]
+        poffset = koff + b16 * self.indexoffset.size
+        nextoffset = self.indexoffset.unpack_from(data, poffset)[0]
+        return (nextoffset, b16offset + 1, poffset)
+
+    def _readindexlinkoffset(self, ioffset):
+        """read an index-entry, return (index-offset, link-offset)"""
+        ioffset += self.keylen.size
+        loffset = self.linkoffset.unpack_from(self._data, ioffset)[0]
+        return (ioffset, loffset)
+
+    def _readlinkvalues(self, loffset):
+        """return list of values (uint32)"""
+        result = []
+        data = self._data
+        while loffset != 0:
+            loffset, value = self.linkentry.unpack_from(data, loffset)
+            result.append(value)
+        return result
+
+    def _appendleafentry(self, bkey, linkoffset=0):
+        """append a leaf-entry, return offset of that entry
+
+        when len(bkey) is 0, append an empty radix-entry instead.
+        """
+        data = self._data
+        offset = len(data)
+        buf = bytearray()
+        buf += self.keylen.pack(len(bkey))
+        buf += self.linkoffset.pack(linkoffset)
+        buf += bkey
+        # leaf-entry is at least len(radix-entry) long
+        _enlarge(buf, self.radixentrysize)
+        data += buf
+        return offset
+
+    def _appendlinkentry(self, value, nextoffset=0):
+        """append a link entry, return its offset"""
+        linkdata = self._data
+        offset = len(linkdata)
+        entry = self.linkentry.pack(nextoffset, value)
+        linkdata += entry
+        return offset
+
+    def _splitleafentry(self, offset):
+        """split leaf-entry into a radix-entry and another leaf-entry
+
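+        For example (illustrative): a leaf-entry with key [3, 4, 5] and
+        link-offset L is rewritten in-place into a radix-entry with
+        link-offset 0 whose slot 3 points to a new leaf-entry appended at
+        the end of the buffer, holding key [4, 5] and link-offset L.
+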
+        return offset of the new leaf entry.
+        """
+        data = self._data
+        klen = self.keylen.unpack_from(data, offset)[0]
+        assert klen > 0 # must be a leaf
+
+        koff = offset + self.keylen.size
+        loff = self.linkoffset.unpack_from(data, koff)[0]
+        koff += self.linkoffset.size
+        key = data[koff:koff + klen]
+
+        # create a new leaf entry
+        newoffset = self._appendleafentry(key[1:], loff)
+
+        # rewrite entry to radix in-place
+        radixbuf = bytearray(self.radixentrysize)
+        self.indexoffset.pack_into(radixbuf, self.linkoffset.size +
+                                   self.keylen.size + key[0] *
+                                   self.indexoffset.size, newoffset)
+        data[offset:offset + len(radixbuf)] = radixbuf
+        return newoffset
+
diff --git a/tests/test-radixlink.py b/tests/test-radixlink.py
new file mode 100644
--- /dev/null
+++ b/tests/test-radixlink.py
@@ -0,0 +1,198 @@
+from __future__ import absolute_import
+
+import random
+import unittest
+
+from mercurial import (
+    policy,
+    util,
+)
+
+import silenttestrunner
+
+# dumpradixlink needs some struct.Struct definitions from pure
+from mercurial.pure import radixlink as pureradixlink
+
+# main test respects modulepolicy
+radixlink = policy.importmod(r'radixlink')
+
+randint = random.randint
+
+# conservative LONG_MAX (the minimum value Python guarantees)
+longmax = 0x7fffffff
+
+# keys for the manual test - prefixes of other keys are more interesting
+interestingkeys = ['abc', 'a', '', 'abcde', 'abcdg', 'abcd', 'abcxy', 'abcyz',
+                   'b', 'xyz']
+
+# expected content dump for testfixedcase
+expectedfixedcase = '''
+    4 | link 1324 | radix 6: 76 7: 1228
+   76 | link    0 | radix 1: 156 2: 1148
+  148 | link    0 | value 0
+  156 | link 1316 | radix 6: 228
+  228 | link    0 | radix 2: 316
+  300 | link    0 | value 1
+  308 | link    0 | value 2
+  316 | link    0 | radix 6: 388
+  388 | link    0 | radix 3: 460
+  460 | link 1308 | radix 6: 532 7: 916
+  532 | link    0 | radix 4: 612
+  604 | link    0 | value 3
+  612 | link 1348 | radix 6: 684
+  684 | link    0 | radix 5: 756 7: 828
+  756 | link 1332 | radix
+  828 | link 1340 | radix
+  900 | link    0 | value 4
+  908 | link    0 | value 5
+  916 | link    0 | radix 8: 996 9: 1068
+  988 | link    0 | value 6
+  996 | link 1356 | leaf  [7, 9]
+ 1068 | link 1364 | leaf  [7, 10]
+ 1140 | link    0 | value 7
+ 1148 | link 1372 | radix
+ 1220 | link    0 | value 8
+ 1228 | link 1380 | leaf  [8, 7, 9, 7, 10]
+ 1300 | link    0 | value 9
+ 1308 | link  148 | value 0
+ 1316 | link  300 | value 10
+ 1324 | link  308 | value 20
+ 1332 | link  604 | value 30
+ 1340 | link  900 | value 40
+ 1348 | link  908 | value 50
+ 1356 | link  988 | value 60
+ 1364 | link 1140 | value 70
+ 1372 | link 1220 | value 80
+ 1380 | link 1300 | value 90
+'''
+
+def dumpradixlink(rl, fout):
+    """dump radixlink content in a human-readable way"""
+    p = pureradixlink.radixlink
+    data = rl.data
+    linkoffsets = set()
+    descs = {} # {offset: entry-desc}
+    tovisit = [(p.headersize, 'index')] # [(offset, 'index' | 'link')]
+    visited = set([0])
+    while tovisit:
+        offset, type = tovisit.pop()
+        if offset in visited:
+            continue
+        visited.add(offset)
+        if type == 'link':
+            nextloff, value = p.linkentry.unpack_from(data, offset)
+            desc = 'link%5d | value %d' % (nextloff, value)
+            if nextloff:
+                tovisit.append((nextloff, 'link'))
+        elif type == 'index':
+            koff = offset
+            klen = p.keylen.unpack_from(data, koff)[0]
+            koff += p.keylen.size
+            loff = p.linkoffset.unpack_from(data, koff)[0]
+            linkoffsets.add(loff)
+            koff += p.linkoffset.size
+            desc = 'link%5d | ' % loff
+            tovisit.append((loff, 'link'))
+            if klen > 0:
+                key = data[koff:koff + klen]
+                desc += 'leaf  %r' % list(bytearray(key))
+            else:
+                desc += 'radix'
+                for i in xrange(16):
+                    v = p.indexoffset.unpack_from(
+                        data, koff + i * p.indexoffset.size)[0]
+                    if v > 0:
+                        desc += ' %d: %d' % (i, v)
+                        tovisit.append((v, 'index'))
+        descs[offset] = desc
+    for offset, desc in sorted(descs.items()):
+        fout.write('%5d | %s\n' % (offset, desc))
+    fout.flush()
+
+def splitstriplines(text):
+    lines = [l.strip() for l in text.splitlines()]
+    return [l for l in lines if l]
+
+class testradixlink(unittest.TestCase):
+    def randombytes(self, length, maxint):
+        s = bytearray(length)
+        for i in xrange(length):
+            s[i] = randint(0, maxint)
+        return bytes(s)
+
+    def testfixedcase(self):
+        rl = radixlink.radixlink()
+        self.assertEqual(len(rl), 4 + 72)
+        for i, k in enumerate(interestingkeys):
+            with self.assertRaises(KeyError):
+                rl[k]
+            rl.insert(k, i)
+            self.assertEqual(rl[k], [i])
+        self.assertEqual(len(rl), 1308)
+        for i, k in enumerate(interestingkeys):
+            self.assertEqual(rl[k], [i])
+            subkey = k[:len(k) - 1]
+            if subkey not in interestingkeys:
+                with self.assertRaises(KeyError):
+                    rl[subkey]
+            rl.insert(k, i * 10)
+            self.assertEqual(rl[k], [i * 10, i])
+        self.assertEqual(len(rl), 1388)
+        fout = util.stringio()
+        dumpradixlink(rl, fout)
+        self.assertEqual(splitstriplines(fout.getvalue()),
+                         splitstriplines(expectedfixedcase))
+
+    def _testrandom(self, n, minkeylen, maxkeylen, maxkeyint):
+        rl = radixlink.radixlink()
+        d = {}
+        valuemax = longmax
+        for i in range(n):
+            k = self.randombytes(randint(minkeylen, maxkeylen), maxkeyint)
+            v = randint(0, valuemax)
+            rl.insert(k, v)
+            d.setdefault(k, []).insert(0, v)
+            self.assertEqual(rl[k], d[k])
+        rl.sourceoftruthsize = truthsize = randint(0, valuemax)
+        buf = rl.data
+        rl = radixlink.radixlink(buf) # serialize and de-serialize
+        for k, v in d.items():
+            self.assertTrue(k in rl)
+            self.assertEqual(rl[k], v)
+            for altkey in (k[:len(k) - 1], k + b'\0'):
+                self.assertEqual(altkey in rl, altkey in d)
+                if altkey not in d:
+                    with self.assertRaises(KeyError):
+                        rl[altkey]
+        self.assertEqual(rl.sourceoftruthsize, truthsize)
+
+    def testrandomdense(self):
+        self._testrandom(5000, 0, 5, 5)
+        self._testrandom(5000, 10, 10, 1)
+
+    def testrandomsparse(self):
+        self._testrandom(1000, 0, 1000, 255)
+
+    @unittest.skipIf(radixlink is pureradixlink, 'unnecessary for pure')
+    def testbinarycompatibility(self):
+        # binary representation should be the same as the pure version
+        rlnative = radixlink.radixlink()
+        rlpure = pureradixlink.radixlink()
+        self.assertEqual(len(rlnative), len(rlpure))
+        for i, k in enumerate(interestingkeys):
+            rlnative.insert(k, i)
+            rlpure.insert(k, i)
+        valuemax = longmax
+        for i in range(1000):
+            k = self.randombytes(randint(0, 1000), 255)
+            v = randint(0, valuemax)
+            rlnative.insert(k, v)
+            rlpure.insert(k, v)
+        truthsize = randint(0, valuemax)
+        rlnative.sourceoftruthsize = truthsize
+        rlpure.sourceoftruthsize = truthsize
+        self.assertEqual(len(rlnative), len(rlpure))
+        self.assertEqual(rlnative.data, rlpure.data)
+
+if __name__ == '__main__':
+    silenttestrunner.main(__name__)

