[PATCH 1 of 6 RFC] radixlink: introduce a new data structure

Jun Wu quark at fb.com
Mon May 22 01:31:08 UTC 2017


# HG changeset patch
# User Jun Wu <quark at fb.com>
# Date 1495396237 25200
#      Sun May 21 12:50:37 2017 -0700
# Node ID fec85b1af16b0360e7bd78cd26d4d21edf678962
# Parent  4e51b2a99847904f1cc5a9c74784f19d69e829d0
# Available At https://bitbucket.org/quark-zju/hg-draft
#              hg pull https://bitbucket.org/quark-zju/hg-draft -r fec85b1af16b
radixlink: introduce a new data structure

The radixlink structure is designed to address several hard-to-solve
performance issues in Mercurial. It is an on-disk radix-tree as index,
linking to linked list as data (so it could store multiple values per key).

For long, core Mercurial has no real key-value data structure, and various
operations have suboptimal time complexity:

  1. changelog.rev(node): O(len(repo))
  2. obsstore.precursors[node]: O(len(markers))
  3. adjustlinkrev slow path (a complex problem)

The new data structure could make the first two O(len(node)), and provide a
cache for linkrevs where the keys could be "filenode + path + '\0'".

The next patches will try to solve the obsstore performance issue.

diff --git a/mercurial/radixlink.py b/mercurial/radixlink.py
new file mode 100644
--- /dev/null
+++ b/mercurial/radixlink.py
@@ -0,0 +1,244 @@
+# radixlink.py - on-disk radixtree index pointing to linked list data
+#
+# Copyright 2017 Facebook, Inc.
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+from __future__ import absolute_import
+
+import os
+import struct
+
+from . import (
+    error,
+)
+
+def _enlarge(buf, size):
+    """enlarge a bytearray to at least given size"""
+    l = len(buf)
+    if l < size:
+        buf += bytearray(size - l)
+
+def _tob16(buf):
+    """convert buffer to base16 bytearray"""
+    result = bytearray(len(buf) * 2)
+    for i, b in enumerate(bytearray(buf)):
+        result[i * 2] = b >> 4
+        result[i * 2 + 1] = b & 0xf
+    return result
+
+class radixlink(object):
+    """multimap with O(len(key) + len([value])) lookup and O(len(key)) insertion
+
+    A "radixlink" structure consists of 2 files: indexfile, and linkfile.
+
+    The indexfile is the radix tree with pointers to linkfile:
+
+        indexdata        := index-header + index-entry-list
+        index-header     := source of truth offset (8B)
+        index-entry-list := '' | index-entry-list + index-entry
+        index-entry      := radix-entry | leaf-entry
+        radix-entry      := '\0' + index-offset (4B) * 16
+        leaf-entry       := '\1' + link-offset (4B) + key-length (2B) + key
+
+        note: leaf-entry is at least len(radix-entry) long so it could be
+        converted to radix-entry in-place.
+
+    The linkfile is an append-only linked list:
+
+        linkdata        := link-header + link-entry-list
+        link-header     := '\0'
+        link-entry-list := '' | link-entry-list + link-entry
+        link-entry      := next-link-offset (4B) + value (4B)
+
+    radixlink should be used as caches, source of truth should be an
+    append-only data structure. The header in indexfile consists an offset
+    about the append-only source of truth so revlink could be updated
+    incrementally.
+
+    radixlink does not support two keys where one is prefix of the other.
+    Application should use same-length keys, or append unique suffix to avoid
+    that.
+    """
+
+    indexheader = struct.Struct('>Q')
+    indexoffset = struct.Struct('>I')
+    linkoffset = struct.Struct('>I')
+
+    keylen = struct.Struct('>H')
+    radixentrysize = 1 + indexoffset.size * 16
+
+    radixentrytype = 0
+    leafentrytype = 1
+
+    linkentry = struct.Struct('>II')
+
+    emptyindex = b'\0' * (indexheader.size + radixentrysize)
+    emptylink = b'\0'
+
+    def __init__(self, vfs, indexfile, linkfile=None):
+        linkfile = linkfile or indexfile + '.l'
+        self.vfs = vfs
+        self.indexfile = indexfile
+        self.linkfile = linkfile
+        self.indexdata = bytearray(vfs.tryread(indexfile) or self.emptyindex)
+        self.linkdata = bytearray(vfs.tryread(linkfile) or self.emptylink)
+        self._dirty = False
+
+    def flush(self):
+        if self._dirty:
+            for path, data in [(self.linkfile, self.linkdata),
+                               (self.indexfile, self.indexdata)]:
+                f = self.vfs(path, 'wb', atomictemp=True)
+                f.write(data)
+                f.close()
+            self._dirty = False
+
+    @property
+    def truthoffset(self):
+        """position of the append-only source of truth the index contains"""
+        if len(self.indexdata) >= self.indexheader.size:
+            return self.indexheader.unpack_from(self.indexdata)[0]
+        return 0
+
+    @truthoffset.setter
+    def truthoffset(self, newoffset):
+        if self.truthoffset == newoffset:
+            return
+        self.indexheader.pack_into(self.indexdata, 0, newoffset)
+        self._dirty = True
+
+    def destroy(self):
+        """destory the revlink cache so it has to be built from scratch"""
+        self.vfs.tryunlink(self.indexfile)
+        self.vfs.tryunlink(self.linkfile)
+        self.indexdata = bytearray(self.emptyindex)
+        self.linkdata = bytearray(self.emptylink)
+
+    def get(self, key):
+        """key: bytearray"""
+        # starting index offset and suboffset
+        ioff = self.indexheader.size
+        bkey = _tob16(key)
+        boff = 0
+        while boff < len(bkey):
+            ioff, boff, _poff = self._followindex(ioff, bkey, boff)
+            if ioff == 0:
+                return []
+        # read linkdata
+        ioff, loff = self._readlinkoffset(ioff)
+        result = []
+        while loff != 0:
+            loff, value = self.linkentry.unpack_from(self.linkdata, loff)
+            result.append(value)
+        return result
+
+    def insert(self, key, value):
+        """key: int, value: buffer"""
+        ioff = self.indexheader.size
+        bkey = _tob16(key)
+        boff = 0
+        data = self.indexdata
+        while boff < len(bkey):
+            ioff, boff, poff = self._followindex(ioff, bkey, boff, True)
+            if ioff == 0:
+                # create new leaf
+                assert poff != 0
+                ioff = self._appendleafentry(bkey[boff:])
+                self.indexoffset.pack_into(data, poff, ioff)
+                break
+
+        # update linked list
+        ioff, loff = self._readlinkoffset(ioff)
+        newloff = self._appendlinkvalue(value, loff)
+        self.linkoffset.pack_into(data, ioff, newloff)
+        self._dirty = True
+
+    def _followindex(self, offset, b16list, b16offset, splitleaf=False):
+        """follow index at char b16list[b16offset]
+
+        return (offset, b16offset, poffset) or raise
+        """
+        data = self.indexdata
+        type = data[offset]
+        if data[offset] == self.leafentrytype:
+            koff = offset + 1 + self.linkoffset.size
+            klen = self.keylen.unpack_from(data, koff)[0]
+            koff += self.keylen.size
+            if data[koff:koff + klen] == b16list[b16offset:b16offset + klen]:
+                if len(b16list) - b16offset > klen:
+                    raise error.ProgrammingError('existing key cannot be '
+                                                 'prefix of new key')
+                return (offset, len(b16list), 0)
+            if not splitleaf:
+                return (0, b16offset, 0)
+            self._splitleaf(offset)
+        # radix entry
+        assert data[offset] == self.radixentrytype
+        b16 = b16list[b16offset]
+        poffset = offset + 1 + b16 * self.indexoffset.size
+        nextoffset = self.indexoffset.unpack_from(data, poffset)[0]
+        return (nextoffset, b16offset + 1, poffset)
+
+    def _readlinkoffset(self, ioffset):
+        """given offset of an index-entry, return (index-offset, link-offset)
+
+        index-offset is an offset in indexdata, where link-offset is stored.
+        link-offset is an offset in linkdata.
+        """
+        # must be a leaf-entry
+        type = self.indexdata[ioffset]
+        ioffset += 1
+        if type != self.leafentrytype:
+            raise error.ProgrammingError('key cannot be prefix of existing key')
+        loffset = self.linkoffset.unpack_from(self.indexdata, ioffset)[0]
+        return (ioffset, loffset)
+
+    def _appendleafentry(self, bkey, linkoffset=0):
+        """append a leaf entry to indexdata, return offset of that entry"""
+        # leaf-entry := '\1' + link-offset (4B) + key-length (2B) + key
+        data = self.indexdata
+        offset = len(data)
+        buf = bytearray([self.leafentrytype])
+        buf += self.linkoffset.pack(linkoffset)
+        buf += self.keylen.pack(len(bkey))
+        buf += bkey
+        # leaf-entry is at least len(radix-entry) long
+        _enlarge(buf, self.radixentrysize)
+        data += buf
+        return offset
+
+    def _splitleaf(self, offset):
+        """split a leaf entry into a radix entry and another leaf entry
+
+        leaf entry at offset will be turned into a radix entry.
+        return new offset of the new leaf entry.
+        """
+        data = self.indexdata
+        assert data[offset] == self.leafentrytype
+
+        loff = self.linkoffset.unpack_from(data, offset + 1)[0]
+        koff = offset + 1 + self.linkoffset.size
+        klen = self.keylen.unpack_from(data, koff)[0]
+        if klen == 0:
+            raise error.ProgrammingError('existing key cannot be prefix '
+                                         'of new key')
+        koff += self.keylen.size
+        key = data[koff:koff + klen]
+
+        # create a new leaf entry
+        newoffset = self._appendleafentry(key[1:], loff)
+
+        # rewrite entry to radix in-place
+        radixbuf = bytearray(self.radixentrysize)
+        self.indexoffset.pack_into(radixbuf, 1 + key[0] *
+                                   self.indexoffset.size, newoffset)
+        data[offset:offset + len(radixbuf)] = radixbuf
+        return newoffset
+
+    def _appendlinkvalue(self, value, nextoffset=0):
+        """append value to linkdata, return offset"""
+        offset = len(self.linkdata)
+        entry = self.linkentry.pack(nextoffset, value)
+        self.linkdata += entry
+        return offset
diff --git a/tests/test-radixlink.py b/tests/test-radixlink.py
new file mode 100644
--- /dev/null
+++ b/tests/test-radixlink.py
@@ -0,0 +1,61 @@
+from __future__ import absolute_import
+
+import os
+import random
+
+from mercurial import (
+    error,
+    radixlink,
+    vfs,
+)
+
+randint = random.randint
+
+def ensure(expr):
+    if not expr:
+        raise RuntimeError, 'unexpected'
+
+fs = vfs.vfs(os.environ['TESTTMP'])
+
+rl1 = radixlink.radixlink(fs, 'rl-test')
+rl1.destroy()
+
+# basic functions
+ensure(rl1.get('notfound') == [])
+ensure(rl1.truthoffset == 0)
+
+# prefix key detection
+rl1.insert('abc', 3)
+for buggykey in ['abcdef', 'ab']:
+    try:
+        rl1.insert('abcdef', 3)
+        raise RuntimeError, 'k1 being prefix of k2 should not be allowed'
+    except error.ProgrammingError:
+        pass
+rl1.destroy()
+
+# insert random data
+def urandom(n):
+    s = bytearray(n + 1)
+    for i in xrange(n):
+        s[i] = randint(0, 254)
+    s[n] = 255
+    return s
+
+d = {}
+for i in range(10000):
+    k = bytes(urandom(randint(0, 20)))
+    v = randint(0, 4294967295)
+    rl1.insert(k, v)
+    d.setdefault(k, []).insert(0, v)
+    v2 = rl1.get(k)
+    ensure(v2[0] == v)
+
+rl1.truthoffset = 333
+rl1.flush()
+
+# reload and verify against d
+rl2 = radixlink.radixlink(fs, 'rl-test')
+ensure(rl2.truthoffset == 333)
+for k, vs in d.items():
+    ensure(rl2.get(k) == vs)


More information about the Mercurial-devel mailing list