D3990: linelog: add a Python implementation of the linelog datastructure

durin42 (Augie Fackler) phabricator at mercurial-scm.org
Wed Aug 1 18:47:23 EDT 2018


durin42 updated this revision to Diff 9759.
durin42 marked 4 inline comments as done.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D3990?vs=9681&id=9759

REVISION DETAIL
  https://phab.mercurial-scm.org/D3990

AFFECTED FILES
  contrib/wix/help.wxs
  mercurial/help/internals/linelog.txt
  mercurial/linelog.py
  tests/test-linelog.py

CHANGE DETAILS

diff --git a/tests/test-linelog.py b/tests/test-linelog.py
new file mode 100644
--- /dev/null
+++ b/tests/test-linelog.py
@@ -0,0 +1,173 @@
+from __future__ import absolute_import, print_function
+
+import difflib
+import random
+import unittest
+
+from mercurial import linelog
+
+maxlinenum = 0xffffff
+maxb1 = 0xffffff
+maxdeltaa = 10
+maxdeltab = 10
+
+def _genedits(seed, endrev):
+    lines = []
+    random.seed(seed)
+    rev = 0
+    for rev in range(0, endrev):
+        n = len(lines)
+        a1 = random.randint(0, n)
+        a2 = random.randint(a1, min(n, a1 + maxdeltaa))
+        b1 = random.randint(0, maxb1)
+        b2 = random.randint(b1, b1 + maxdeltab)
+        blines = [(rev, idx) for idx in range(b1, b2)]
+        lines[a1:a2] = blines
+        yield lines, rev, a1, a2, b1, b2
+
+class linelogtests(unittest.TestCase):
+    def testlinelogencodedecode(self):
+        program = [linelog._eof(0, 0),
+                   linelog._jge(41, 42),
+                   linelog._jump(0, 43),
+                   linelog._eof(0, 0),
+                   linelog._jl(44, 45),
+                   linelog._line(46, 47),
+                   ]
+        ll = linelog.linelog(program, maxrev=100)
+        enc = ll.encode()
+        # round-trips okay
+        self.assertEqual(linelog.linelog.fromdata(enc)._program, ll._program)
+        self.assertEqual(linelog.linelog.fromdata(enc), ll)
+        # This encoding matches the encoding used by hg-experimental's
+        # linelog file, or is supposed to if it doesn't.
+        self.assertEqual(enc, ('\x00\x00\x01\x90\x00\x00\x00\x06'
+                               '\x00\x00\x00\xa4\x00\x00\x00*'
+                               '\x00\x00\x00\x00\x00\x00\x00+'
+                               '\x00\x00\x00\x00\x00\x00\x00\x00'
+                               '\x00\x00\x00\xb1\x00\x00\x00-'
+                               '\x00\x00\x00\xba\x00\x00\x00/'))
+
+    def testsimpleedits(self):
+        ll = linelog.linelog()
+        # Initial revision: add lines 0, 1, and 2
+        ll.replacelines(1, 0, 0, 0, 3)
+        self.assertEqual([(l.rev, l.linenum) for l in ll.annotate(1)],
+                         [(1, 0),
+                          (1, 1),
+                          (1, 2),
+                         ])
+        # Replace line 1 with a new line
+        ll.replacelines(2, 1, 2, 1, 2)
+        self.assertEqual([(l.rev, l.linenum) for l in ll.annotate(2)],
+                         [(1, 0),
+                          (2, 1),
+                          (1, 2),
+                         ])
+        # delete a line out of 2
+        ll.replacelines(3, 1, 2, 0, 0)
+        self.assertEqual([(l.rev, l.linenum) for l in ll.annotate(3)],
+                         [(1, 0),
+                          (1, 2),
+                         ])
+        # annotation of 1 is unchanged
+        self.assertEqual([(l.rev, l.linenum) for l in ll.annotate(1)],
+                         [(1, 0),
+                          (1, 1),
+                          (1, 2),
+                         ])
+        ll.annotate(3) # set internal state to revision 3
+        start = ll.getoffset(0)
+        end = ll.getoffset(1)
+        self.assertEqual(ll.getalllines(start, end), [
+            (1, 0),
+            (2, 1),
+            (1, 1),
+        ])
+        self.assertEqual(ll.getalllines(), [
+            (1, 0),
+            (2, 1),
+            (1, 1),
+            (1, 2),
+        ])
+
+    def testparseclinelogfile(self):
+        # This data is what the replacements in testsimpleedits
+        # produce when fed to the original linelog.c implementation.
+        data = ('\x00\x00\x00\x0c\x00\x00\x00\x0f'
+                '\x00\x00\x00\x00\x00\x00\x00\x02'
+                '\x00\x00\x00\x05\x00\x00\x00\x06'
+                '\x00\x00\x00\x06\x00\x00\x00\x00'
+                '\x00\x00\x00\x00\x00\x00\x00\x07'
+                '\x00\x00\x00\x06\x00\x00\x00\x02'
+                '\x00\x00\x00\x00\x00\x00\x00\x00'
+                '\x00\x00\x00\t\x00\x00\x00\t'
+                '\x00\x00\x00\x00\x00\x00\x00\x0c'
+                '\x00\x00\x00\x08\x00\x00\x00\x05'
+                '\x00\x00\x00\x06\x00\x00\x00\x01'
+                '\x00\x00\x00\x00\x00\x00\x00\x05'
+                '\x00\x00\x00\x0c\x00\x00\x00\x05'
+                '\x00\x00\x00\n\x00\x00\x00\x01'
+                '\x00\x00\x00\x00\x00\x00\x00\t')
+        llc = linelog.linelog.fromdata(data)
+        self.assertEqual([(l.rev, l.linenum) for l in llc.annotate(1)],
+                         [(1, 0),
+                          (1, 1),
+                          (1, 2),
+                         ])
+        self.assertEqual([(l.rev, l.linenum) for l in llc.annotate(2)],
+                         [(1, 0),
+                          (2, 1),
+                          (1, 2),
+                         ])
+        self.assertEqual([(l.rev, l.linenum) for l in llc.annotate(3)],
+                         [(1, 0),
+                          (1, 2),
+                         ])
+        # Check we emit the same bytecode.
+        ll = linelog.linelog()
+        # Initial revision: add lines 0, 1, and 2
+        ll.replacelines(1, 0, 0, 0, 3)
+        # Replace line 1 with a new line
+        ll.replacelines(2, 1, 2, 1, 2)
+        # delete a line out of 2
+        ll.replacelines(3, 1, 2, 0, 0)
+        diff = '\n   ' + '\n   '.join(difflib.unified_diff(
+            ll.debugstr().splitlines(), llc.debugstr().splitlines(),
+            'python', 'c', lineterm=''))
+        self.assertEqual(ll._program, llc._program, 'Program mismatch: ' + diff)
+        # Done as a secondary step so we get a better result if the
+        # program is where the mismatch is.
+        self.assertEqual(ll, llc)
+        self.assertEqual(ll.encode(), data)
+
+    def testanothersimplecase(self):
+        ll = linelog.linelog()
+        ll.replacelines(3, 0, 0, 0, 2)
+        ll.replacelines(4, 0, 2, 0, 0)
+        self.assertEqual([(l.rev, l.linenum) for l in ll.annotate(4)],
+                         [])
+        self.assertEqual([(l.rev, l.linenum) for l in ll.annotate(3)],
+                         [(3, 0), (3, 1)])
+        # rev 2 is empty because contents were only ever introduced in rev 3
+        self.assertEqual([(l.rev, l.linenum) for l in ll.annotate(2)],
+                         [])
+
+    def testrandomedits(self):
+        # Inspired by original linelog tests.
+        seed = random.random()
+        numrevs = 2000
+        ll = linelog.linelog()
+        # Populate linelog
+        for lines, rev, a1, a2, b1, b2 in _genedits(seed, numrevs):
+            ll.replacelines(rev, a1, a2, b1, b2)
+            ar = ll.annotate(rev)
+            self.assertEqual(ll.annotateresult, lines)
+        # Verify we can get back these states by annotating each rev
+        for lines, rev, a1, a2, b1, b2 in _genedits(seed, numrevs):
+            ar = ll.annotate(rev)
+            self.assertEqual([(l.rev, l.linenum) for l in ar], lines)
+
+if __name__ == '__main__':
+    import silenttestrunner
+    silenttestrunner.main(__name__)
diff --git a/mercurial/linelog.py b/mercurial/linelog.py
new file mode 100644
--- /dev/null
+++ b/mercurial/linelog.py
@@ -0,0 +1,414 @@
+# linelog - efficient cache for annotate data
+#
+# Copyright 2018 Google LLC.
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+"""linelog is an efficient cache for annotate data inspired by SCCS Weaves.
+
+SCCS Weaves are an implementation of
+https://en.wikipedia.org/wiki/Interleaved_deltas. See
+mercurial/help/internals/linelog.txt for an exploration of SCCS weaves
+and how linelog works in detail.
+
+Here's a hacker's summary: a linelog is a program which is executed in
+the context of a revision. Executing the program emits information
+about lines, including the revision that introduced them and the line
+number in the file at the introducing revision. When an insertion or
+deletion is performed on the file, a jump instruction is used to patch
+in a new body of annotate information.
+"""
+from __future__ import absolute_import, print_function
+
+import abc
+import struct
+
+from mercurial import (
+    pycompat,
+)
+from .thirdparty import (
+    attr,
+)
+
+_llentry = struct.Struct('>II')
+
+class LineLogError(Exception):
+    """Error raised when something bad happens internally in linelog."""
+
+ at attr.s
+class lineinfo(object):
+    # Introducing revision of this line.
+    rev = attr.ib()
+    # Line number for this line in its introducing revision.
+    linenum = attr.ib()
+    # Private. Offset in the linelog program of this line. Used internally.
+    _offset = attr.ib()
+
+ at attr.s
+class annotateresult(object):
+    rev = attr.ib()
+    lines = attr.ib()
+    _eof = attr.ib()
+
+    def __iter__(self):
+        return iter(self.lines)
+
+class _llinstruction(object):
+
+    __metaclass__ = abc.ABCMeta
+
+    @abc.abstractmethod
+    def __init__(self, op1, op2):
+        pass
+
+    @abc.abstractmethod
+    def __str__(self):
+        pass
+
+    def __repr__(self):
+        return str(self)
+
+    @abc.abstractmethod
+    def __eq__(self, other):
+        pass
+
+    @abc.abstractmethod
+    def encode(self):
+        """Encode this instruction to the binary linelog format."""
+
+    @abc.abstractmethod
+    def execute(self, rev, pc, emit):
+        """Execute this instruction.
+
+        Args:
+          rev: The revision we're annotating.
+          pc: The current offset in the linelog program.
+          emit: A function that accepts a single lineinfo object.
+
+        Returns:
+          The new value of pc. Returns None if exeuction should stop
+          (that is, we've found the end of the file.)
+        """
+
+class _jge(_llinstruction):
+    """If the current rev is greater than or equal to op1, jump to op2."""
+
+    def __init__(self, op1, op2):
+        self._cmprev = op1
+        self._target = op2
+
+    def __str__(self):
+        return 'JGE %d %d' % (self._cmprev, self._target)
+
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self._cmprev == other._cmprev
+                and self._target == other._target)
+
+    def encode(self):
+        return _llentry.pack(self._cmprev << 2, self._target)
+
+    def execute(self, rev, pc, emit):
+        if rev >= self._cmprev:
+            return self._target
+        return pc + 1
+
+class _jump(_llinstruction):
+    """Unconditional jumps are expressed as a JGE with op1 set to 0."""
+
+    def __init__(self, op1, op2):
+        if op1 != 0:
+            raise LineLogError("malformed JUMP, op1 must be 0, got %d" % op1)
+        self._target = op2
+
+    def __str__(self):
+        return 'JUMP %d' % (self._target)
+
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self._target == other._target)
+
+    def encode(self):
+        return _llentry.pack(0, self._target)
+
+    def execute(self, rev, pc, emit):
+        return self._target
+
+class _eof(_llinstruction):
+    """EOF is expressed as a JGE that always jumps to 0."""
+
+    def __init__(self, op1, op2):
+        if op1 != 0:
+            raise LineLogError("malformed EOF, op1 must be 0, got %d" % op1)
+        if op2 != 0:
+            raise LineLogError("malformed EOF, op2 must be 0, got %d" % op2)
+
+    def __str__(self):
+        return 'EOF'
+
+    def __eq__(self, other):
+        return type(self) == type(other)
+
+    def encode(self):
+        return _llentry.pack(0, 0)
+
+    def execute(self, rev, pc, emit):
+        return None
+
+class _jl(_llinstruction):
+    """If the current rev is less than op1, jump to op2."""
+
+    def __init__(self, op1, op2):
+        self._cmprev = op1
+        self._target = op2
+
+    def __str__(self):
+        return 'JL %d %d' % (self._cmprev, self._target)
+
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self._cmprev == other._cmprev
+                and self._target == other._target)
+
+    def encode(self):
+        return _llentry.pack(1 | (self._cmprev << 2), self._target)
+
+    def execute(self, rev, pc, emit):
+        if rev < self._cmprev:
+            return self._target
+        return pc + 1
+
+class _line(_llinstruction):
+    """Emit a line."""
+
+    def __init__(self, op1, op2):
+        # This line was introduced by this revision number.
+        self._rev = op1
+        # This line had the specified line number in the introducing revision.
+        self._origlineno = op2
+
+    def __str__(self):
+        return 'LINE %d %d' % (self._rev, self._origlineno)
+
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self._rev == other._rev
+                and self._origlineno == other._origlineno)
+
+    def encode(self):
+        return _llentry.pack(2 | (self._rev << 2), self._origlineno)
+
+    def execute(self, rev, pc, emit):
+        emit(lineinfo(self._rev, self._origlineno, pc))
+        return pc + 1
+
+def _decodeone(data, offset):
+    """Decode a single linelog instruction from an offset in a buffer."""
+    try:
+        op1, op2 = _llentry.unpack_from(data, offset)
+    except struct.error as e:
+        raise LineLogError('reading an instruction failed: %r' % e)
+    opcode = op1 & 0b11
+    op1 = op1 >> 2
+    if opcode == 0:
+        if op1 == 0:
+            if op2 == 0:
+                return _eof(op1, op2)
+            return _jump(op1, op2)
+        return _jge(op1, op2)
+    elif opcode == 1:
+        return _jl(op1, op2)
+    elif opcode == 2:
+        return _line(op1, op2)
+    raise NotImplementedError('Unimplemented opcode %r' % opcode)
+
+class linelog(object):
+    """Efficient cache for per-line history information."""
+
+    def __init__(self, program=None, maxrev=0):
+        if program is None:
+            # We pad the program with an extra leading EOF so that our
+            # offsets will match the C code exactly. This means we can
+            # interoperate with the C code.
+            program = [_eof(0, 0), _eof(0, 0)]
+        self._program = program
+        self._lastannotate = None
+        self._maxrev = maxrev
+
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self._program == other._program
+                and self._maxrev == other._maxrev)
+
+    def __repr__(self):
+        return '<linelog at %s: maxrev=%d size=%d>' % (
+            hex(id(self)), self._maxrev, len(self._program))
+
+    def debugstr(self):
+        fmt = '%%%dd %%s' % len(str(len(self._program)))
+        return '\n'.join(
+            fmt % (idx, i) for idx, i in enumerate(self._program[1:], 1))
+
+    @classmethod
+    def fromdata(cls, buf):
+        if len(buf) % _llentry.size != 0:
+            raise LineLogError(
+                "invalid linelog buffer size %d (must be a multiple of %d)" % (
+                    len(buf), _llentry.size))
+        expected = len(buf) / _llentry.size
+        fakejge = _decodeone(buf, 0)
+        if isinstance(fakejge, _jump):
+            maxrev = 0
+        else:
+            maxrev = fakejge._cmprev
+        numentries = fakejge._target
+        if expected != numentries:
+            raise LineLogError("corrupt linelog data: claimed"
+                               " %d entries but given data for %d entries" % (
+                                   expected, numentries))
+        instructions = [_eof(0, 0)]
+        for offset in pycompat.xrange(1, numentries):
+            instructions.append(_decodeone(buf, offset * _llentry.size))
+        return cls(instructions, maxrev=maxrev)
+
+    def encode(self):
+        hdr = _jge(self._maxrev, len(self._program)).encode()
+        return hdr + ''.join(i.encode() for i in self._program[1:])
+
+    def clear(self):
+        self._program = []
+        self._maxrev = 0
+        self._lastannotate = None
+
+    def replacelines(self, rev, a1, a2, b1, b2):
+        """Replace lines [a1, a2) with lines [b1, b2)."""
+        if self._lastannotate:
+            # TODO(augie): make replacelines() accept a revision at
+            # which we're editing as well as a revision to mark
+            # responsible for the edits. In hg-experimental it's
+            # stateful like this, so we're doing the same thing to
+            # retain compatibility with absorb until that's imported.
+            ar = self._lastannotate
+        else:
+            ar = self.annotate(rev)
+            #        ar = self.annotate(self._maxrev)
+        if a1 > len(ar.lines):
+            raise LineLogError(
+                '%d contains %d lines, tried to access line %d' % (
+                    rev, len(ar.lines), a1))
+        elif a1 == len(ar.lines):
+            # Simulated EOF instruction since we're at EOF, which
+            # doesn't have a "real" line.
+            a1inst = _eof(0, 0)
+            a1info = lineinfo(0, 0, ar._eof)
+        else:
+            a1info = ar.lines[a1]
+            a1inst = self._program[a1info._offset]
+        oldproglen = len(self._program)
+        appendinst = self._program.append
+
+        # insert
+        if b1 < b2:
+            # Determine the jump target for the JGE at the start of
+            # the new block.
+            tgt = oldproglen + (b2 - b1 + 1)
+            # Jump to skip the insert if we're at an older revision.
+            appendinst(_jl(rev, tgt))
+            for linenum in pycompat.xrange(b1, b2):
+                appendinst(_line(rev, linenum))
+        # delete
+        if a1 < a2:
+            if a2 > len(ar.lines):
+                raise LineLogError(
+                    '%d contains %d lines, tried to access line %d' % (
+                        rev, len(ar.lines), a2))
+            elif a2 == len(ar.lines):
+                endaddr = ar._eof
+            else:
+                endaddr = ar.lines[a2]._offset
+            if a2 > 0 and rev < self._maxrev:
+                # If we're here, we're deleting a chunk of an old
+                # commit, so we need to be careful and not touch
+                # invisible lines between a2-1 and a2 (IOW, lines that
+                # are added later).
+                endaddr = ar.lines[a2 - 1]._offset + 1
+            appendinst(_jge(rev, endaddr))
+        # copy instruction from a1
+        appendinst(a1inst)
+        # if a1inst isn't a jump or EOF, then we need to add an unconditional
+        # jump back into the program here.
+        if not isinstance(a1inst, (_jump, _eof)):
+            appendinst(_jump(0, a1info._offset + 1))
+        # Patch instruction at a1, which makes our patch live.
+        self._program[a1info._offset] = _jump(0, oldproglen)
+        # For compat with the C version, re-annotate rev so that
+        # self.annotateresult is cromulent.. We could fix up the
+        # annotateresult in place (which is how the C version works),
+        # but for now we'll pass on that and see if it matters in
+        # practice.
+        self.annotate(max(self._lastannotate.rev, rev))
+        if rev > self._maxrev:
+            self._maxrev = rev
+
+    def annotate(self, rev):
+        pc = 1
+        lines = []
+        # Sanity check: if len(lines) is longer than len(program), we
+        # hit an infinite loop in the linelog program somehow and we
+        # should stop.
+        while pc is not None and len(lines) < len(self._program):
+            inst = self._program[pc]
+            lastpc = pc
+            pc = inst.execute(rev, pc, lines.append)
+        if pc is not None:
+            raise LineLogError(
+                'Probably hit an infinite loop in linelog. Program:\n' +
+                self.debugstr())
+        ar = annotateresult(rev, lines, lastpc)
+        self._lastannotate = ar
+        return ar
+
+    @property
+    def maxrev(self):
+        return self._maxrev
+
+    # Stateful methods which depend on the value of the last
+    # annotation run. This API is for compatiblity with the original
+    # linelog, and we should probably consider refactoring it.
+    @property
+    def annotateresult(self):
+        """Return the last annotation result. C linelog code exposed this."""
+        return [(l.rev, l.linenum) for l in self._lastannotate.lines]
+
+    def getoffset(self, line):
+        return self._lastannotate.lines[line]._offset
+
+    def getalllines(self, start=0, end=0):
+        """Get all lines that ever occurred in [start, end).
+
+        Passing start == end == 0 means "all lines ever".
+
+        This works in terms of *internal* program offsets, not line numbers.
+        """
+        pc = start or 1
+        lines = []
+        # only take as many steps as there are instructions in the
+        # program - if we don't find an EOF or our stop-line before
+        # then, something is badly broken.
+        for step in pycompat.xrange(len(self._program)):
+            inst = self._program[pc]
+            nextpc = pc + 1
+            if isinstance(inst, _jump):
+                nextpc = inst._target
+            elif isinstance(inst, _eof):
+                return lines
+            elif isinstance(inst, (_jl, _jge)):
+                pass
+            elif isinstance(inst, _line):
+                lines.append((inst._rev, inst._origlineno))
+            else:
+                raise LineLogError("Illegal instruction %r" % inst)
+            if nextpc == end:
+                return lines
+            pc = nextpc
+        raise LineLogError("Failed to perform getalllines")
diff --git a/mercurial/help/internals/linelog.txt b/mercurial/help/internals/linelog.txt
new file mode 100644
--- /dev/null
+++ b/mercurial/help/internals/linelog.txt
@@ -0,0 +1,251 @@
+linelog is a storage format inspired by the "Interleaved deltas" idea. See
+https://en.wikipedia.org/wiki/Interleaved_deltas for its introduction.
+
+0. SCCS Weave
+
+  To understand what linelog is, first we have a quick look at a simplified
+  (with header removed) SCCS weave format, which is an implementation of the
+  "Interleaved deltas" idea.
+
+0.1 Basic SCCS Weave File Format
+
+  A SCCS weave file consists of plain text lines. Each line is either a
+  special instruction starting with "^A" or part of the content of the real
+  file the weave tracks. There are 3 important operations, where REV denotes
+  the revision number:
+
+    ^AI REV, marking the beginning of an insertion block introduced by REV
+    ^AD REV, marking the beginning of a deletion block introduced by REV
+    ^AE REV, marking the end of the block started by "^AI REV" or "^AD REV"
+
+  Note on revision numbers: For any two different revision numbers, one must
+  be an ancestor of the other to make them comparable. This enforces linear
+  history. Besides, the comparison functions (">=", "<") should be efficient.
+  This means, if revisions are strings like git or hg, an external map is
+  required to convert them into integers.
+
+  For example, to represent the following changes:
+
+    REV 1 | REV 2 | REV 3
+    ------+-------+-------
+    a     | a     | a
+    b     | b     | 2
+    c     | 1     | c
+          | 2     |
+          | c     |
+
+  A possible weave file looks like:
+
+    ^AI 1
+    a
+    ^AD 3
+    b
+    ^AI 2
+    1
+    ^AE 3
+    2
+    ^AE 2
+    c
+    ^AE 1
+
+  An "^AE" does not always match its nearest operation ("^AI" or "^AD"). In
+  the above example, "^AE 3" does not match the nearest "^AI 2" but "^AD 3".
+  Therefore we need some extra information for "^AE". The SCCS weave uses a
+  revision number. It could also be a boolean value about whether it is an
+  insertion or a deletion (see section 0.4).
+
+0.2 Checkout
+
+  The "checkout" operation is to retrieve file content at a given revision,
+  say X. It's doable by going through the file line by line and:
+
+    - If meet ^AI rev, and rev > X, find the corresponding ^AE and jump there
+    - If meet ^AD rev, and rev <= X, find the corresponding ^AE and jump there
+    - Ignore ^AE
+    - For normal lines, just output them
+
+0.3 Annotate
+
+  The "annotate" operation is to show extra metadata like the revision number
+  and the original line number a line comes from.
+
+  It's basically just a "Checkout". For the extra metadata, they can be stored
+  side by side with the line contents. Alternatively, we can infer the
+  revision number from "^AI"s.
+
+  Some SCM tools have to calculate diffs on the fly and thus are much slower
+  on this operation.
+
+0.4 Tree Structure
+
+  The word "interleaved" is used because "^AI" .. "^AE" and "^AD" .. "^AE"
+  blocks can be interleaved.
+
+  If we consider insertions and deletions separately, they can form tree
+  structures, respectively.
+
+    +--- ^AI 1        +--- ^AD 3
+    | +- ^AI 2        | +- ^AD 2
+    | |               | |
+    | +- ^AE 2        | +- ^AE 2
+    |                 |
+    +--- ^AE 1        +--- ^AE 3
+
+  More specifically, it's possible to build a tree for all insertions, where
+  the tree node has the structure "(rev, startline, endline)". "startline" is
+  the line number of "^AI" and "endline" is the line number of the matched
+  "^AE".  The tree will have these properties:
+
+    1. child.rev > parent.rev
+    2. child.startline > parent.startline
+    3. child.endline < parent.endline
+
+  A similar tree for all deletions can also be built with the first property
+  changed to:
+
+    1. child.rev < parent.rev
+
+0.5 Malformed Cases
+
+  The following cases are considered malformed in our implementation:
+
+    1. Interleaved insertions, or interleaved deletions.
+       It can be rewritten to a non-interleaved tree structure.
+
+       ^AI/D x     ^AI/D x
+       ^AI/D y  -> ^AI/D y
+       ^AE x       ^AE y
+       ^AE y       ^AE x
+
+    2. Nested insertions, where the inner one has a smaller revision number.
+       It can be rewritten to a non-nested form.
+
+       ^AI x + 1     ^AI x + 1
+       ^AI x      -> ^AE x + 1
+       ^AE x         ^AI x
+       ^AE x + 1     ^AE x
+
+    3. Insertion or deletion inside another deletion, where the outer deletion
+       block has a smaller revision number.
+
+       ^AD x          ^AD x
+       ^AI/D x + 1 -> ^AE x
+       ^AE x + 1      ^AI/D x + 1
+       ^AE x          ^AE x
+
+  Some of them may be valid in other implementations for special purposes. For
+  example, to "revive" a previously deleted block in a newer revision.
+
+0.6 Cases Can Be Optimized
+
+  It's always better to get things nested. For example, the left is more
+  efficient than the right while they represent the same content:
+
+    +--- ^AD 2          +- ^AD 1
+    | +- ^AD 1          |   LINE A
+    | |   LINE A        +- ^AE 1
+    | +- ^AE 1          +- ^AD 2
+    |     LINE B        |   LINE B
+    +--- ^AE 2          +- ^AE 2
+
+  Our implementation sometimes generates the less efficient data. To always
+  get the optimal form, it requires extra code complexity that seems unworthy.
+
+0.7 Inefficiency
+
+  The file format can be slow because:
+
+  - Inserting a new line at position P requires rewriting all data after P.
+  - Finding "^AE" requires walking through the content (O(N), where N is the
+    number of lines between "^AI/D" and "^AE").
+
+1. Linelog
+
+  The linelog is a binary format that dedicates to speed up mercurial (or
+  git)'s "annotate" operation. It's designed to avoid issues mentioned in
+  section 0.7.
+
+1.1 Content Stored
+
+  Linelog is not another storage for file contents. It only stores line
+  numbers and corresponding revision numbers, instead of actual line content.
+  This is okay for the "annotate" operation because usually the external
+  source is fast to checkout the content of a file at a specific revision.
+
+  A typical SCCS weave is also fast on the "grep" operation, which needs
+  random accesses to line contents from different revisions of a file. This
+  can be slow with linelog's no-line-content design. However we could use
+  an extra map ((rev, line num) -> line content) to speed it up.
+
+  Note the revision numbers in linelog should be independent from mercurial
+  integer revision numbers. There should be some mapping between linelog rev
+  and hg hash stored side by side, to make the files reusable after being
+  copied to another machine.
+
+1.2 Basic Format
+
+  A linelog file consists of "instruction"s. An "instruction" can be either:
+
+    - JGE  REV ADDR     # jump to ADDR if rev >= REV
+    - JL   REV ADDR     # jump to ADDR if rev < REV
+    - LINE REV LINENUM  # append the (LINENUM+1)-th line in revision REV
+
+  For example, here is the example linelog representing the same file with
+  3 revisions mentioned in section 0.1:
+
+    SCCS  |    Linelog
+    Weave | Addr : Instruction
+    ------+------+-------------
+    ^AI 1 |    0 : JL   1 8
+    a     |    1 : LINE 1 0
+    ^AD 3 |    2 : JGE  3 6
+    b     |    3 : LINE 1 1
+    ^AI 2 |    4 : JL   2 7
+    1     |    5 : LINE 2 2
+    ^AE 3 |
+    2     |    6 : LINE 2 3
+    ^AE 2 |
+    c     |    7 : LINE 1 2
+    ^AE 1 |
+          |    8 : END
+
+  This way, "find ^AE" is O(1) because we just jump there. And we can insert
+  new lines without rewriting most part of the file by appending new lines and
+  changing a single instruction to jump to them.
+
+  The current implementation uses 64 bits for an instruction: The opcode (JGE,
+  JL or LINE) takes 2 bits, REV takes 30 bits and ADDR or LINENUM takes 32
+  bits. It also stores the max revision number and buffer size at the first
+  64 bits for quick access to these values.
+
+1.3 Comparing with Mercurial's revlog format
+
+  Apparently, linelog is very different from revlog: linelog stores rev and
+  line numbers, while revlog has line contents and other metadata (like
+  parents, flags). However, the revlog format could also be used to store rev
+  and line numbers. For example, to speed up the annotate operation, we could
+  also pre-calculate annotate results and just store them using the revlog
+  format.
+
+  Therefore, linelog is actually somehow similar to revlog, with the important
+  trade-off that it only supports linear history (mentioned in section 0.1).
+  Essentially, the differences are:
+
+    a) Linelog is full of deltas, while revlog could contain full file
+       contents sometimes. So linelog is smaller. Revlog could trade
+       reconstruction speed for file size - best case, revlog is as small as
+       linelog.
+    b) The interleaved delta structure allows skipping large portion of
+       uninteresting deltas so linelog's content reconstruction is faster than
+       the delta-only version of revlog (however it's possible to construct
+       a case where interleaved deltas degrade to plain deltas, so linelog
+       worst case would be delta-only revlog). Revlog could trade file size
+       for reconstruction speed.
+    c) Linelog implicitly maintains the order of all lines it stores. So it
+       could dump all the lines from all revisions, with a reasonable order.
+       While revlog could also dump all line additions, it requires extra
+       computation to figure out the order putting those lines - that's some
+       kind of "merge".
+
+  "c" makes "hg absorb" easier to implement and makes it possible to do
+  "annotate --deleted".
diff --git a/contrib/wix/help.wxs b/contrib/wix/help.wxs
--- a/contrib/wix/help.wxs
+++ b/contrib/wix/help.wxs
@@ -46,6 +46,7 @@
             <File Id="internals.censor.txt"       Name="censor.txt" />
             <File Id="internals.changegroups.txt" Name="changegroups.txt" />
             <File Id="internals.config.txt"       Name="config.txt" />
+            <File Id="internals.linelog.txt"      Name="linelog.txt" />
             <File Id="internals.requirements.txt" Name="requirements.txt" />
             <File Id="internals.revlogs.txt"      Name="revlogs.txt" />
             <File Id="internals.wireprotocol.txt" Name="wireprotocol.txt" />



To: durin42, #hg-reviewers, indygreg
Cc: indygreg, mercurial-devel


More information about the Mercurial-devel mailing list