[PATCH V2] contrib: add a tool to detect cyclic references

Gregory Szorc gregory.szorc at gmail.com
Sun May 28 14:45:04 EDT 2017


On Sat, May 27, 2017 at 10:02 PM, Jun Wu <quark at fb.com> wrote:

> # HG changeset patch
> # User Jun Wu <quark at fb.com>
> # Date 1495946922 25200
> #      Sat May 27 21:48:42 2017 -0700
> # Node ID 869505c8d99d438f1c7d0775645d4ac28d70c15e
> # Parent  ad37c569ec8121a99e4cd9da52365b114b82e744
> # Available At https://bitbucket.org/quark-zju/hg-draft
> #              hg pull https://bitbucket.org/quark-zju/hg-draft -r
> 869505c8d99d
> contrib: add a tool to detect cyclic references
>
> This patch adds a simple tool to find out all cyclic references for
> CPython.
>
> The added test shows some common cases where cycles are created.
>

This seems reasonable. However, the tool as it stands isn't very usable
because you must include it in sys.path and import it. I'd favor adding
this as mercurial/leaks.py (or possibly adding to mercurial/profiling.py)
so it is trivial to import and use the context manager. That, or refactor
it as an extension and have the extension stash a reference to the context
manager under mercurial.util.leakdetect (or similar) - again so it is easy
to use.


>
> diff --git a/contrib/leakdetect.py b/contrib/leakdetect.py
> new file mode 100644
> --- /dev/null
> +++ b/contrib/leakdetect.py
> @@ -0,0 +1,147 @@
> +# leakdetect.py - CPython object cyclic references analysis
> +#
> +# Copyright 2017 Facebook, Inc.
> +#
> +# This software may be used and distributed according to the terms of the
> +# GNU General Public License version 2 or any later version.
> +
> +from __future__ import absolute_import
> +
> +import contextlib
> +import gc
> +import os
> +import platform
> +import sys
> +
> +def stronglyconnectedcomponents(vertices, edges):
> +    """yield strongly connected components in a directed graph"""
> +    ignored = set() # vertices outputted already
> +    stack = []      # vertice DFS stack
> +    index = {}      # stack[index[n]] == n; later visit: bigger index
> +    lowlink = {}    # see Tarjan's strongly connected components algorithm
> +    execstack = []  # enumerate execution stack to bypass Python stack
> limit
> +
> +    def visit(v):
> +        index[v] = lowlink[v] = len(stack)
> +        stack.append(v)
> +        execstack.append((checkroot, (v,)))
> +        for w in edges.get(v, []):
> +            execstack.append((updatelowlink, (v, w)))
> +            if w not in index:
> +                execstack.append((visit, (w,)))
> +
> +    def updatelowlink(v, w):
> +        lowlink[v] = min(lowlink[v], lowlink[w])
> +
> +    def checkroot(v):
> +        i = index[v]
> +        if lowlink[v] == i:
> +            component = stack[i:]
> +            del stack[i:]
> +            if component[0] not in ignored:
> +                ignored.update(component)
> +                return component
> +
> +    for v in vertices:
> +        if v not in ignored:
> +            index.clear()
> +            lowlink.clear()
> +            execstack.append((visit, (v,)))
> +            while execstack:
> +                func, args = execstack.pop()
> +                result = func(*args)
> +                if result is not None:
> +                    yield result
> +
> +def objectreferencegraph(objects):
> +    """return (vertices, edges), referenced relationship among objects
> +
> +    vertices contain object IDs.
> +    """
> +    idset = set(id(o) for o in objects)
> +    edges = {}
> +    for o in objects:
> +        edges[id(o)] = [id(r) for r in gc.get_referents(o) if id(r) in
> idset]
> +    return idset, edges
> +
> +def describe(obj):
> +    """short and easy to understand text to help identify an object
> +
> +    similar to repr(), but avoids huge outputs in all cases.
> +    """
> +    if isinstance(obj, tuple):
> +        return '<tuple (%s)>' % ', '.join(describe(v) for v in obj)
> +    elif isinstance(obj, type):
> +        return '<type %s>' % obj.__name__
> +    elif isinstance(obj, type(describe)): # function
> +        code = obj.func_code
> +        path = os.path.basename(code.co_filename)
> +        return ('<function %s@%s:%s>' % (obj.func_name, path,
> +                                         code.co_firstlineno))
> +    elif isinstance(obj, type(os.environ.get)): # instancemethod
> +        return repr(obj)
> +    else:
> +        if type(obj) in (list, set, dict):
> +            tryrepr = repr(obj)
> +            if len(tryrepr) < 70:
> +                return tryrepr
> +        cellcontent = getattr(obj, 'cell_contents', None)
> +        if cellcontent is not None:
> +            return '<cell %s>' % describe(cellcontent)
> +        typename = type(obj).__name__
> +        if typename == 'getset_descriptor':
> +            return repr(obj)
> +        return '<%s>' % typename
> +
> +def analysegarbage(garbage, out):
> +    """find cyclic references among garbage objects and report them"""
> +    out.write('--- leak detection report ---\n'
> +              '%d objects not freed by reference counting\n' %
> len(garbage))
> +    vertices, edges = objectreferencegraph(garbage)
> +    groups = []
> +    for component in stronglyconnectedcomponents(vertices, edges):
> +        if len(component) == 1:
> +            v = component[0]
> +            if v in edges[v]: # self-cycle
> +                groups.append((v,))
> +        else:
> +            groups.append(component)
> +
> +    objmap = {id(o): o for o in garbage}
> +    outputs = []
> +    for g in groups:
> +        output = 'cycle of %d objects:\n' % len(g)
> +        output += ''.join(sorted('  %s\n' % describe(objmap[i]) for i in
> g))
> +        outputs.append(output)
> +    out.write(''.join(sorted(outputs)))
> +
> +if platform.python_implementation() == 'CPython':
> +    # only verified to work in CPython
> +    @contextlib.contextmanager
> +    def leakdetect(out=sys.stderr):
> +        # collect garbage so previous garbage won't affect us
> +        gc.set_debug(gc.get_debug() & ~gc.DEBUG_SAVEALL)
> +        gc.collect()
> +        # make gc.collect put objects to gc.garbage instead of releasing
> them
> +        gc.set_debug(gc.get_debug() | gc.DEBUG_SAVEALL)
> +        gc.disable()
> +        try:
> +            yield
> +        finally:
> +            # populate gc.garbage
> +            gc.collect()
> +            garbage = gc.garbage
> +            # restore the default behavior
> +            gc.set_debug(gc.get_debug() & ~gc.DEBUG_SAVEALL)
>

Wouldn't this be semantically better if it restored the original settings
stashed away from a gc.get_debug()?


> +            if garbage:
> +                analysegarbage(garbage, out)
> +                del gc.garbage[:]
> +
> +    disabled = False
> +else:
> +    # fallback to a no-op gracefully
> +    @contextlib.contextmanager
> +    def leakdetect(out=sys.stderr):
> +        yield
> +
> +    disabled = True
> diff --git a/tests/test-leakdetect.py b/tests/test-leakdetect.py
> new file mode 100644
> --- /dev/null
> +++ b/tests/test-leakdetect.py
> @@ -0,0 +1,50 @@
> +from __future__ import absolute_import
> +
> +import inspect
> +import os
> +import sys
> +
> +dirname = os.path.dirname
> +sys.path.insert(0, os.path.join(dirname(dirname(__file__)), 'contrib'))
> +
> +import leakdetect
> +
> +if leakdetect.disabled:
> +    sys.stdout.write('skipped: missing feature: cpython\n')
> +    sys.exit(80)
> +
> +def leaklistdict():
> +    a = []
> +    b = {}
> +    b['a'] = a
> +    a.append(b)
> +    a.append([[[[['65537']]]]])
> +    c = [('long text' * 100,)]
> +    c.append(c)
> +
> +def leakframe():
> +    frame = inspect.currentframe()
> +    a = []
> +    a.append(a)
> +    b = [[['some text']]]
> +
> +def leakfunc():
> +    def func(x):
> +        return func(x + 1)
> +    f = lambda x: g(x)
> +    g = lambda x: f(x)
> +
> +def leakclass():
> +    # http://bugs.python.org/msg60985
> +    class A(object):
> +        pass
> +    B = type('B', (object,), {})
> +    a = A()
> +    b = B()
> +    class C(a.__class__):
> +        pass
> +    a.__class__ = C
> +
> +for leak in [leaklistdict, leakframe, leakfunc, leakclass]:
> +    with leakdetect.leakdetect():
> +        leak()
> diff --git a/tests/test-leakdetect.py.out b/tests/test-leakdetect.py.out
> new file mode 100644
> --- /dev/null
> +++ b/tests/test-leakdetect.py.out
> @@ -0,0 +1,43 @@
> +--- leak detection report ---
> +9 objects not freed by reference counting
> +cycle of 1 objects:
> +  <list>
> +cycle of 2 objects:
> +  [{'a': [...]}, [[[[['65537']]]]]]
> +  {'a': [{...}, [[[[['65537']]]]]]}
> +--- leak detection report ---
> +5 objects not freed by reference counting
> +cycle of 1 objects:
> +  <frame>
> +cycle of 1 objects:
> +  [[...]]
> +--- leak detection report ---
> +9 objects not freed by reference counting
> +cycle of 3 objects:
> +  <cell <function func at test-leakdetect.py:32>>
> +  <function func at test-leakdetect.py:32>
> +  <tuple (<cell <function func at test-leakdetect.py:32>>)>
> +cycle of 6 objects:
> +  <cell <function <lambda>@test-leakdetect.py:34>>
> +  <cell <function <lambda>@test-leakdetect.py:35>>
> +  <function <lambda>@test-leakdetect.py:34>
> +  <function <lambda>@test-leakdetect.py:35>
> +  <tuple (<cell <function <lambda>@test-leakdetect.py:34>>)>
> +  <tuple (<cell <function <lambda>@test-leakdetect.py:35>>)>
> +--- leak detection report ---
> +15 objects not freed by reference counting
> +cycle of 2 objects:
> +  <tuple (<type C>, <type A>, <type object>)>
> +  <type C>
> +cycle of 5 objects:
> +  <attribute '__dict__' of 'A' objects>
> +  <attribute '__weakref__' of 'A' objects>
> +  <dict>
> +  <tuple (<type A>, <type object>)>
> +  <type A>
> +cycle of 5 objects:
> +  <attribute '__dict__' of 'B' objects>
> +  <attribute '__weakref__' of 'B' objects>
> +  <dict>
> +  <tuple (<type B>, <type object>)>
> +  <type B>
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at mercurial-scm.org
> https://www.mercurial-scm.org/mailman/listinfo/mercurial-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://www.mercurial-scm.org/pipermail/mercurial-devel/attachments/20170528/6909c965/attachment.html>


More information about the Mercurial-devel mailing list