[PATCH] py3: refactor token parsing to handle call args properly

Gregory Szorc gregory.szorc at gmail.com
Thu Oct 13 04:32:31 EDT 2016


On Thu, Oct 13, 2016 at 10:10 AM, Martijn Pieters <mj at zopatista.com> wrote:

> # HG changeset patch
> # User Martijn Pieters <mjpieters at fb.com>
> # Date 1476346188 -3600
> #      Thu Oct 13 09:09:48 2016 +0100
> # Node ID 81d23b9e2b329666db6e342f6bafec54a893687c
> # Parent  733fb9f7bc92c694ba6bededaeb93206528c0bcd
> py3: refactor token parsing to handle call args properly
>
> The token parsing was getting unwieldy and was too naive about accessing
> arguments.
>
> diff --git a/mercurial/__init__.py b/mercurial/__init__.py
> --- a/mercurial/__init__.py
> +++ b/mercurial/__init__.py
> @@ -185,6 +185,61 @@
>          OR CACHED FILES WON'T GET INVALIDATED PROPERLY.
>          """
>          futureimpline = False
> +
> +        # The following utility functions access the tokens list and the i
> +        # index of the for i, t in enumerate(tokens) loop below
> +        def _is_op(j, *o):
> +            """Assert that tokens[j] is an OP with one of the given
> values"""
> +            try:
> +                return tokens[j].type == token.OP and tokens[j].string in o
> +            except IndexError:
> +                return False
> +
> +        def _find_argn_of_call(n):
> +            """Find arg n of a call expression (start at 0)
> +
> +            Returns index of the first token of that argument, or None if
> +            there are not that many arguments.
> +
> +            Assumes that tokens[i + 1] is '('.
> +
> +            """
> +            j = i + 2
> +            nested = 0
> +            try:
> +                while True:
> +                    if _is_op(j, ')', ']', '}'):
> +                        # end of call, tuple, subscription or dict / set
> +                        nested -= 1
> +                        if nested < 0:
> +                            return None
> +                    elif n == 0:
> +                        # this is the starting position of arg
> +                        return j
> +                    elif _is_op(j, '(', '[', '{'):
> +                        nested += 1
> +                    elif _is_op(j, ',') and nested == 0:
> +                        n -= 1
> +                    j += 1
> +            except IndexError:
> +                return None
> +
> +        def _ensure_unicode(j):
> +            """Make sure the token at j is a unicode string
> +
> +            This rewrites a string token to include the unicode literal prefix
> +            so the string transformer won't add the byte prefix.
> +
> +            Ignores tokens that are not strings. Assumes bounds checking has
> +            already been done.
> +
> +            """
> +            st = tokens[j]
> +            if st.type == token.STRING and st.string.startswith(("'", '"')):
> +                rt = tokenize.TokenInfo(st.type, 'u%s' % st.string,
> +                                        st.start, st.end, st.line)
> +                tokens[j] = rt
> +
>          for i, t in enumerate(tokens):
>              # Convert most string literals to byte literals. String literals
>              # in Python 2 are bytes. String literals in Python 3 are unicode.
> @@ -241,91 +296,35 @@
>                                               '')
>                  continue
>
> -            try:
> -                nexttoken = tokens[i + 1]
> -            except IndexError:
> -                nexttoken = None
> -
> -            try:
> -                prevtoken = tokens[i - 1]
> -            except IndexError:
> -                prevtoken = None
> -
>              # This looks like a function call.
> -            if (t.type == token.NAME and nexttoken and
> -                nexttoken.type == token.OP and nexttoken.string == '('):
> +            if t.type == token.NAME and _is_op(i + 1, '('):
>                  fn = t.string
>
>                  # *attr() builtins don't accept byte strings to 2nd argument.
> -                # Rewrite the token to include the unicode literal prefix so
> -                # the string transformer above doesn't add the byte prefix.
> -                if fn in ('getattr', 'setattr', 'hasattr', 'safehasattr'):
> -                    try:
> -                        # (NAME, 'getattr')
> -                        # (OP, '(')
> -                        # (NAME, 'foo')
> -                        # (OP, ',')
> -                        # (NAME|STRING, foo)
> -                        st = tokens[i + 4]
> -                        if (st.type == token.STRING and
> -                            st.string[0] in ("'", '"')):
> -                            rt = tokenize.TokenInfo(st.type, 'u%s' % st.string,
> -                                                    st.start, st.end, st.line)
> -                            tokens[i + 4] = rt
> -                    except IndexError:
> -                        pass
> +                if (fn in ('getattr', 'setattr', 'hasattr', 'safehasattr') and
> +                        not _is_op(i - 1, '.')):
> +                    arg1idx = _find_argn_of_call(1)
> +                    if arg1idx is not None:
> +                        _ensure_unicode(arg1idx)
>
>                  # .encode() and .decode() on str/bytes/unicode don't accept
> -                # byte strings on Python 3. Rewrite the token to include the
> -                # unicode literal prefix so the string transformer above doesn't
> -                # add the byte prefix. The loop helps in handling multiple
> -                # arguments.
> -                if (fn in ('encode', 'decode') and
> -                    prevtoken.type == token.OP and prevtoken.string == '.'):
> -                    # (OP, '.')
> -                    # (NAME, 'encode')
> -                    # (OP, '(')
> -                    # [(VARIABLE, encoding)]
> -                    # [(OP, '.')]
> -                    # [(VARIABLE, encoding)]
> -                    # [(OP, ',')]
> -                    # (STRING, 'utf-8')
> -                    # (OP, ')')
> -                    j = i
> -                    try:
> -                        while (tokens[j + 1].string in ('(', ',', '.')):
> -                            st = tokens[j + 2]
> -                            if (st.type == token.STRING and
> -                                st.string[0] in ("'", '"')):
> -                                rt = tokenize.TokenInfo(st.type,
> -                                                    'u%s' % st.string,
> -                                                    st.start, st.end, st.line)
> -                                tokens[j + 2] = rt
> -                            j = j + 2
> -                    except IndexError:
> -                        pass
> +                # byte strings on Python 3.
> +                elif fn in ('encode', 'decode') and _is_op(i - 1, '.'):
> +                    for argn in range(2):
> +                        argidx = _find_argn_of_call(argn)
> +                        if argidx is not None:
> +                            _ensure_unicode(argidx)
>
> -                # Bare open call (not an attribute on something else)
> -                if (fn == 'open' and not (prevtoken.type == token.OP and
> -                                          prevtoken.string == '.')):
> -                    try:
> -                        # (NAME, 'open')
> -                        # (OP, '(')
> -                        # (NAME|STRING, 'filename')
> -                        # (OP, ',')
> -                        # (NAME|STRING, mode)
> -                        st = tokens[i + 4]
> -                        if (st.type == token.STRING and
> -                                st.string[0] in ("'", '"')):
> -                            rt = tokenize.TokenInfo(st.type, 'u%s' % st.string,
> -                                                    st.start, st.end, st.line)
> -                            tokens[i + 4] = rt
> -                    except IndexError:
> -                        pass
> +                # Bare open call (not an attribute on something else); the
> +                # second argument (mode) must be a string, not bytes
> +                elif fn == 'open' and not _is_op(i - 1, '.'):
> +                    arg1idx = _find_argn_of_call(1)
> +                    if arg1idx is not None:
> +                        _ensure_unicode(arg1idx)
>
>                  # It changes iteritems to items as iteritems is not
>                  # present in Python 3 world.
> -                if fn == 'iteritems':
> +                elif fn == 'iteritems':
>                      yield tokenize.TokenInfo(t.type, 'items',
>                                               t.start, t.end, t.line)
>                      continue
> @@ -337,7 +336,7 @@
>      # ``replacetoken`` or any mechanism that changes semantics of module
>      # loading is changed. Otherwise cached bytecode may get loaded without
>      # the new transformation mechanisms applied.
> -    BYTECODEHEADER = b'HG\x00\x05'
> +    BYTECODEHEADER = b'HG\x00\x06'
>
>      class hgloader(importlib.machinery.SourceFileLoader):
>          """Custom module loader that transforms source code.
>

Nice refactor. While I haven't looked at the code in detail yet, does
check-code not complain about the use of underscore_function_names?
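
For anyone following along, a quick illustration (not part of the patch;
the calls below are just examples) of why these rewrites are needed: once
the transformer turns bare string literals into bytes, these stdlib calls
raise TypeError on Python 3 unless the affected arguments stay unicode.

    # Sketch only: each call rejects byte strings on Python 3, which is
    # why the transformer re-adds the u'' prefix to those arguments
    # before the byte-prefix pass runs.
    for call in (
        lambda: getattr(object(), b'__class__'),  # attr name must be str
        lambda: u'x'.encode(b'utf-8'),            # encoding must be str
        lambda: open(b'/dev/null', b'rb'),        # mode must be str
    ):
        try:
            call()
        except TypeError as exc:
            print('TypeError:', exc)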
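
And in case it helps review, a standalone sketch (my function name, not
the patch's) of the nesting walk that _find_argn_of_call performs,
runnable against tokenize output:

    import io
    import token
    import tokenize

    def nth_arg_index(tokens, i, n):
        """Index of the first token of argument n (0-based) of the call
        whose name is at tokens[i]; assumes tokens[i + 1] is '('."""
        j = i + 2
        nested = 0
        while j < len(tokens):
            t = tokens[j]
            if t.type == token.OP and t.string in (')', ']', '}'):
                nested -= 1
                if nested < 0:
                    return None   # hit the ')' closing the call itself
            elif n == 0:
                return j          # first token of the wanted argument
            elif t.type == token.OP and t.string in ('(', '[', '{'):
                nested += 1
            elif t.type == token.OP and t.string == ',' and nested == 0:
                n -= 1            # a top-level comma ends each argument
            j += 1
        return None

    src = "getattr(x, 'name', (1, 2))"
    toks = list(tokenize.generate_tokens(io.StringIO(src).readline))
    print(toks[nth_arg_index(toks, 0, 1)].string)  # prints 'name'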