[PATCH 1 of 4] revset: replace "list(repo)" for efficiency on large repository

FUJIWARA Katsunori foozy at lares.dti.ne.jp
Fri Mar 29 11:16:31 CDT 2013


# HG changeset patch
# User FUJIWARA Katsunori <foozy at lares.dti.ne.jp>
# Date 1364573132 -32400
# Node ID 4cf0465cd64ff196ad83ec2d10b6e13ed89c2913
# Parent  fabbaa250977ad337a36b1c4cece22da94adfe4b
revset: replace "list(repo)" for efficiency on large repository

Before this patch, "list(repo)" is used as "subset" to mean "whole
revisions in repository".  This creation of list is not efficient in
points below:

   - this causes immediate creation of list containing integers
     corresponding to revisions in repository:

     repository may have many revisions in itself

  - "subset" contents may not be used in some cases:

    "stringset()" omits examination of subset contents, if "len(repo)
    == len(subset)", for example

  - examination of "x in subset" may scans list fully

This patch replaces "list(repo)" by "_safesubset(repo)" to avoid
problems above.

This patch uses "_safesubset(repo)" instead of using "repo" directly,
because "localrepository" class overrides "__nonzero__()" as the
function returning True always: this overriding may cause unexpected
result in "if subset"/"if not subset" examinations.

This patch affects predicates below:

  - dagrange("::")
  - ancestor
  - ancestors(invoking _ancestors internally)
  - branch
  - children
  - descendants(invoking _descendants internally)
  - destination
  - limit
  - last
  - max
  - min
  - origin
  - p1
  - p2
  - parents

Results of "hg perfrevset" for each revspec (before/after this patch)
on the repository containing 40000 revisions are shown below:

  - "tip::tip":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 888)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1125)

  - "ancestor(1)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 2105)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 3847)

  - "ancestors(1)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 861)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1055)

  - "tip and branch(tip)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1887)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 3290)

  - "tip and children(39998)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 2026)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 3650)

  - "descendants(tip)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 869)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1085)

  - "tip and destination(39998)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1989)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 3525)

  - "limit(tip)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 894)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1140)

  - "last(tip)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 890)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1142)

  - "max(tip)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1381)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1969)

  - "min(tip)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1378)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1958)

  - "39998 and origin(tip)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 2001)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 3519)

  - "p1(tip)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 880)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1079)

  - "p2(tip)":
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 877)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1082)

  - "parents(tip)"
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 871)
    ! wall 0.000000 comb 0.000000 user 0.000000 sys 0.000000 (best of 1063)

diff -r fabbaa250977 -r 4cf0465cd64f contrib/check-code.py
--- a/contrib/check-code.py	Sat Feb 09 21:51:21 2013 +0000
+++ b/contrib/check-code.py	Sat Mar 30 01:05:32 2013 +0900
@@ -279,6 +279,15 @@
   []
 ]
 
+inrevsetpats = [
+  [
+    (r'\blist\(repo\)',
+     "use _safesubset(repo) as subset instead of list(repo) in revset"),
+  ],
+  # warnings
+  []
+]
+
 checks = [
     ('python', r'.*\.(py|cgi)$', pyfilters, pypats),
     ('test script', r'(.*/)?test-[^.~]*$', testfilters, testpats),
@@ -288,6 +297,8 @@
      inrevlogpats),
     ('layering violation ui in util', r'mercurial/util\.py', pyfilters,
      inutilpats),
+    ('inefficiency for large repo in revset', r'mercurial/revset\.py',
+     pyfilters, inrevsetpats),
 ]
 
 class norepeatlogger(object):
diff -r fabbaa250977 -r 4cf0465cd64f mercurial/revset.py
--- a/mercurial/revset.py	Sat Feb 09 21:51:21 2013 +0000
+++ b/mercurial/revset.py	Sat Mar 30 01:05:32 2013 +0900
@@ -206,6 +206,18 @@
                 pass
     return None
 
+# this prevents objects overriding '__nonzero__' (e.g. localrepository)
+# from being evaluated as not empty subset even when they are empty
+class _safesubset(object):
+    def __init__(self, o):
+        self._o = o
+    def __len__(self):
+        return len(self._o)
+    def __iter__(self):
+        return iter(self._o)
+    def __contains__(self, key):
+        return key in self._o
+
 # operator methods
 
 def stringset(repo, subset, x):
@@ -239,7 +251,7 @@
 
 def dagrange(repo, subset, x, y):
     if subset:
-        r = list(repo)
+        r = _safesubset(repo)
         xs = _revsbetween(repo, getset(repo, r, x), getset(repo, r, y))
         s = set(subset)
         return [r for r in xs if r in s]
@@ -286,7 +298,7 @@
     """
     # i18n: "ancestor" is a keyword
     l = getlist(x)
-    rl = list(repo)
+    rl = _safesubset(repo)
     anc = None
 
     # (getset(repo, rl, i) for i in l) generates a list of lists
@@ -305,7 +317,7 @@
     return []
 
 def _ancestors(repo, subset, x, followfirst=False):
-    args = getset(repo, list(repo), x)
+    args = getset(repo, _safesubset(repo), x)
     if not args:
         return []
     s = set(_revancestors(repo, args, followfirst)) | set(args)
@@ -432,7 +444,7 @@
         else:
             return [r for r in subset if matcher(repo[r].branch())]
 
-    s = getset(repo, list(repo), x)
+    s = getset(repo, _safesubset(repo), x)
     b = set()
     for r in s:
         b.add(repo[r].branch())
@@ -511,7 +523,7 @@
     """``children(set)``
     Child changesets of changesets in set.
     """
-    s = set(getset(repo, list(repo), x))
+    s = set(getset(repo, _safesubset(repo), x))
     cs = _children(repo, subset, s)
     return [r for r in subset if r in cs]
 
@@ -592,7 +604,7 @@
     return l
 
 def _descendants(repo, subset, x, followfirst=False):
-    args = getset(repo, list(repo), x)
+    args = getset(repo, _safesubset(repo), x)
     if not args:
         return []
     s = set(_revdescendants(repo, args, followfirst)) | set(args)
@@ -616,9 +628,9 @@
     is the same as passing all().
     """
     if x is not None:
-        args = set(getset(repo, list(repo), x))
+        args = set(getset(repo, _safesubset(repo), x))
     else:
-        args = set(getall(repo, list(repo), x))
+        args = set(getall(repo, _safesubset(repo), x))
 
     dests = set()
 
@@ -932,7 +944,7 @@
         # i18n: "limit" is a keyword
         raise error.ParseError(_("limit expects a number"))
     ss = set(subset)
-    os = getset(repo, list(repo), l[0])[:lim]
+    os = getset(repo, _safesubset(repo), l[0])[:lim]
     return [r for r in os if r in ss]
 
 def last(repo, subset, x):
@@ -950,14 +962,14 @@
         # i18n: "last" is a keyword
         raise error.ParseError(_("last expects a number"))
     ss = set(subset)
-    os = getset(repo, list(repo), l[0])[-lim:]
+    os = getset(repo, _safesubset(repo), l[0])[-lim:]
     return [r for r in os if r in ss]
 
 def maxrev(repo, subset, x):
     """``max(set)``
     Changeset with highest revision number in set.
     """
-    os = getset(repo, list(repo), x)
+    os = getset(repo, _safesubset(repo), x)
     if os:
         m = max(os)
         if m in subset:
@@ -994,7 +1006,7 @@
     """``min(set)``
     Changeset with lowest revision number in set.
     """
-    os = getset(repo, list(repo), x)
+    os = getset(repo, _safesubset(repo), x)
     if os:
         m = min(os)
         if m in subset:
@@ -1044,9 +1056,9 @@
     for the first operation is selected.
     """
     if x is not None:
-        args = set(getset(repo, list(repo), x))
+        args = set(getset(repo, _safesubset(repo), x))
     else:
-        args = set(getall(repo, list(repo), x))
+        args = set(getall(repo, _safesubset(repo), x))
 
     def _firstsrc(rev):
         src = _getrevsource(repo, rev)
@@ -1096,7 +1108,7 @@
 
     ps = set()
     cl = repo.changelog
-    for r in getset(repo, list(repo), x):
+    for r in getset(repo, _safesubset(repo), x):
         ps.add(cl.parentrevs(r)[0])
     return [r for r in subset if r in ps]
 
@@ -1114,7 +1126,7 @@
 
     ps = set()
     cl = repo.changelog
-    for r in getset(repo, list(repo), x):
+    for r in getset(repo, _safesubset(repo), x):
         ps.add(cl.parentrevs(r)[1])
     return [r for r in subset if r in ps]
 
@@ -1128,7 +1140,7 @@
 
     ps = set()
     cl = repo.changelog
-    for r in getset(repo, list(repo), x):
+    for r in getset(repo, _safesubset(repo), x):
         ps.update(cl.parentrevs(r))
     return [r for r in subset if r in ps]
 


More information about the Mercurial-devel mailing list