[PATCH 04 of 10 lazy-changelog-parse] changelog: lazily parse manifest node

Gregory Szorc gregory.szorc at gmail.com
Sun Mar 6 18:58:50 EST 2016


# HG changeset patch
# User Gregory Szorc <gregory.szorc at gmail.com>
# Date 1457303353 28800
#      Sun Mar 06 14:29:13 2016 -0800
# Node ID a42fc3233c0886129c9e71b201f9f01880bf9e75
# Parent  d85951413907594c2cb37744ce8b01de2b030930
changelog: lazily parse manifest node

As with the description, we now store the raw bytes of the manifest
node and convert them from hex on access.

This patch also marks the beginning of our new parsing method,
which is based on newline offsets and doesn't rely on
str.split().

Many revsets showed a performance improvement:

author(mpm)
0.896565
0.869085
0.868598

desc(bug)
0.887169
0.928164
0.910400

extra(rebase_source)
0.865446
0.871500
0.841644

author(mpm) or author(greg)
1.801832
1.791589
1.731503

author(mpm) or desc(bug)
1.812438
1.851003
1.798764

date(2015) or branch(default)
0.968276
0.974027
0.945792

diff --git a/mercurial/changelog.py b/mercurial/changelog.py
--- a/mercurial/changelog.py
+++ b/mercurial/changelog.py
@@ -150,17 +150,17 @@ class changelogrevision(object):
     the parsed object.
     """
 
     __slots__ = (
         'date',
         '_rawdesc',
         'extra',
         'files',
-        'manifest',
+        '_rawmanifest',
         'user',
     )
 
     def __new__(cls, text):
         if not text:
             return _changelogrevision(
                 manifest=nullid,
                 user='',
@@ -183,18 +183,20 @@ class changelogrevision(object):
         # files\n\n       : files modified by the cset, no \n or \r allowed
         # (.*)            : comment (free text, ideally utf-8)
         #
         # changelog v0 doesn't use extra
 
         doublenl = text.index('\n\n')
         self._rawdesc = text[doublenl + 2:]
 
+        nl1 = text.index('\n')
+        self._rawmanifest = text[0:nl1]
+
         l = text[:doublenl].split('\n')
-        self.manifest = bin(l[0])
         self.user = encoding.tolocal(l[1])
 
         tdata = l[2].split(' ', 2)
         if len(tdata) != 3:
             time = float(tdata[0])
             try:
                 # various tools did silly things with the time zone field.
                 timezone = int(tdata[1])
@@ -206,16 +208,20 @@ class changelogrevision(object):
             self.extra = decodeextra(tdata[2])
 
         self.date = (time, timezone)
         self.files = l[3:]
 
         return self
 
     @property
+    def manifest(self):
+        return bin(self._rawmanifest)
+
+    @property
     def description(self):
         return encoding.tolocal(self._rawdesc)
 
 class changelog(revlog.revlog):
     def __init__(self, opener):
         revlog.revlog.__init__(self, opener, "00changelog.i")
         if self._initempty:
             # changelogs don't benefit from generaldelta


More information about the Mercurial-devel mailing list