[PATCH] patch: improve handling of filenames containing spaces on import (issue3723)

Mon Dec 10 05:20:37 CST 2012

On Mon, Dec 10, 2012 at 09:50:29AM +0000, Alastair Houghton wrote:
> # HG changeset patch
> # User Alastair Houghton <alastair at coriolis-systems.com>
> # Date 1355130961 0
> # Node ID dd56046b73acdb976ff98e91469c36494b1ddd94
> # Parent  40374059d227850ec2f5fb4f21a1b619136e2a6a
> patch: improve handling of filenames containing spaces on import (issue3723)
> 
> When reading git-format patches, it's possible that we will come across
> a line like:
> 
>   diff --git a/foo b/foo.txt b/foo b/foo.txt
> 
> If the git-format diff in question is a rename or a copy, all is fine,
> because we can reliably read the filenames from subsequent git headers.
> However, if it turns out to be some other kind of diff, we might need
> to extract the filename(s) from a line in this format; this is especially
> the case when dealing with a binary diff, as git's binary diff format
> doesn't repeat the filenames at all.
> 
> Fortunately, there is a heuristic we can use to render the above
> parseable, namely that the two filenames must be the same (since
> otherwise, we're looking at a rename or a copy and we already covered
> that case).
> 
> This patch fixes iterhunks(), readgitpatch() and diffstatdata() to use
> this approach.

Bellow is some comment on your patches, some of them impact code that you just
move around. Feel free to either sent a prelude patch that clean that up or
ignore them.

This is not a LGTM as the whole function is still pretty confusing to me. I'll
try to get a second deeper pass soon.

> diff --git a/mercurial/patch.py b/mercurial/patch.py
> --- a/mercurial/patch.py
> +++ b/mercurial/patch.py
> @@ -15,6 +15,7 @@
>  import context
>  
>  gitre = re.compile('diff --git a/(.*) b/(.*)')
> +gitre_no_namechange = re.compile(r'diff --git a/(.*) b/\1$')

You want a ^ in front of this regex. Otherwise pathological case forged by some
leprechaun might still bite you'

note: I love how greedy a/(.*) ensure consistency of b/\1$ here.

>  
>  class PatchError(Exception):
>      pass
> @@ -270,7 +271,8 @@
>      'islink' is True if the file is a symlink and 'isexec' is True if
>      the file is executable. Otherwise, 'mode' is None.
>      """
> -    def __init__(self, path):
> +    def __init__(self, path, diffline=None):
> +        self.diffline = diffline
>          self.path = path
>          self.oldpath = None
>          self.mode = None
> @@ -283,13 +285,26 @@
>          self.mode = (islink, isexec)
>  
>      def copy(self):
> -        other = patchmeta(self.path)
> +        other = patchmeta(self.path, diffline=self.diffline)
>          other.oldpath = self.oldpath
>          other.mode = self.mode
>          other.op = self.op
>          other.binary = self.binary
>          return other
>  
> +    def reparse(self):
> +        if self.op == 'RENAME' or self.op == 'COPY':
> +            return True
> +
> +        # reparse the diffline with a stricter regex
> +        m = gitre_no_namechange.match(self.diffline)
> +        if not m:
> +            return False
> +
> +        self.path = m.group(1)
> +
> +        return True
> +

What's the purpose of this function ? can you add a small docstring to it ?

>      def _ispatchinga(self, afile):
>          if afile == '/dev/null':
>              return self.op == 'ADD'
> @@ -317,16 +332,16 @@
>          if line.startswith('diff --git'):
>              m = gitre.match(line)
>              if m:
> -                if gp:
> +                if gp and gp.reparse():
>                      gitpatches.append(gp)
>                  dst = m.group(2)
> -                gp = patchmeta(dst)
> +                gp = patchmeta(dst, diffline=line)
>          elif gp:
>              if line.startswith('--- '):
> -                gitpatches.append(gp)
> +                if gp.reparse():
> +                    gitpatches.append(gp)
>                  gp = None
> -                continue
> -            if line.startswith('rename from '):
> +            elif line.startswith('rename from '):
>                  gp.op = 'RENAME'
>                  gp.oldpath = line[12:]
>              elif line.startswith('rename to '):
> @@ -345,7 +360,11 @@
>                  gp.setmode(int(line[-6:], 8))
>              elif line.startswith('GIT binary patch'):
>                  gp.binary = True
> -    if gp:
> +                if gp.reparse():
> +                    gitpatches.append(gp)
> +                gp = None
> +
> +    if gp and gp.reparse():
>          gitpatches.append(gp)
>  
>      return gitpatches
> @@ -1189,6 +1208,11 @@
>  
>      # our states
>      BFILE = 1
> +
> +    scanninggit = False
> +    diffline = None
> +    gitop = None
> +

I would add some comment that explain what those state are about and how they
impact the process.

>      context = None
>      lr = linereader(fp)
>  
> @@ -1196,6 +1220,51 @@
>          x = lr.readline()
>          if not x:
>              break
> +
> +        if scanninggit:
> +            # git format patch headers end at --- or GIT binary patch
> +            if x.startswith('---') or x.startswith('GIT binary patch'):
> +                scanninggit = False
> +
> +                ok = True
> +                if gitop != 'RENAME' and gitop != 'COPY':

nitpick: if gitop not in ('RENAME', 'COPY'):

> +                    # use the stricter regex
> +                    m = gitre_no_namechange.match(diffline)
> +                    if m:
> +                        afile = 'a/' + m.group(1)
> +                        bfile = 'b/' + m.group(1)
> +                    else:
> +                        ok = False
> +
> +                if ok:
> +                    while gitpatches \
> +                              and not gitpatches[-1].ispatching(afile, bfile):
> +                        gp = gitpatches.pop()
> +                        yield 'file', ('a/' + gp.path, 'b/' + gp.path, None,
> +                                       gp.copy())

Sound like a for loop with a break and an else clause.

(yep, it looks like code movement, but that also a good opportunity

> +                    if not gitpatches:
> +                        raise PatchError(_('failed to synchronize metadata '
> +                                           'for "%s"')
> +                                         % afile[2:])
> +                    gp = gitpatches[-1]
> +                    newfile = True
> +            elif x.startswith('rename from '):
> +                gitop = 'RENAME'
> +                afile = 'a/' + x[12:].rstrip(' \r\n')
> +            elif x.startswith('rename to '):
> +                bfile = 'b/' + x[10:].rstrip(' \r\n')
> +            elif x.startswith('copy from '):
> +                gitop = 'COPY'
> +                afile = 'a/' + x[10:].rstrip(' \r\n')
> +            elif x.startswith('copy to '):
> +                bfile = 'b/' + x[8:].rstrip(' \r\n')

if looks like adding the 'a/' and 'b/' prefix later would help readability here
(applies to `gitre_no_namechange` handling too)

Is that possible ?

> +
> +        if newfile:
> +            newfile = False
> +            emitfile = True
> +            state = BFILE
> +            hunknum = 0
> +
>          if state == BFILE and (
>              (not context and x[0] == '@')
>              or (context is not False and x.startswith('***************'))
> @@ -1216,25 +1285,26 @@
>                  yield 'file', (afile, bfile, h, gp and gp.copy() or None)
>              yield 'hunk', h
>          elif x.startswith('diff --git'):
> -            m = gitre.match(x.rstrip(' \r\n'))
> +            xs = x.rstrip(' \r\n')
> +            m = gitre.match(xs)
>              if not m:
>                  continue
> +
>              if gitpatches is None:
>                  # scan whole input for git metadata
>                  gitpatches = scangitpatch(lr, x)
>                  yield 'git', [g.copy() for g in gitpatches
>                                if g.op in ('COPY', 'RENAME')]
>                  gitpatches.reverse()
> +
> +            # defer processing of git metadata until we know for certain
> +            # the names of the files (we need to see the metadata for this
> +            # patch before we really know what they are)
> +            scanninggit = True
> +            diffline = xs
>              afile = 'a/' + m.group(1)
>              bfile = 'b/' + m.group(2)
> -            while gitpatches and not gitpatches[-1].ispatching(afile, bfile):
> -                gp = gitpatches.pop()
> -                yield 'file', ('a/' + gp.path, 'b/' + gp.path, None, gp.copy())
> -            if not gitpatches:
> -                raise PatchError(_('failed to synchronize metadata for "%s"')
> -                                 % afile[2:])
> -            gp = gitpatches[-1]
> -            newfile = True
> +            gitop = 'MODIFY'
>          elif x.startswith('---'):
>              # check for a unified diff
>              l2 = lr.readline()
> @@ -1261,11 +1331,6 @@
>              afile = parsefilename(x)
>              bfile = parsefilename(l2)
>  
> -        if newfile:
> -            newfile = False
> -            emitfile = True
> -            state = BFILE
> -            hunknum = 0
>  
>      while gitpatches:
>          gp = gitpatches.pop()
> @@ -1818,10 +1883,18 @@
>              # set numbers to 0 anyway when starting new file
>              adds, removes, isbinary = 0, 0, False
>              if line.startswith('diff --git'):
> -                filename = gitre.search(line).group(1)
> +                m = gitre_no_namechange.search(line)
> +                if m:
> +                    filename = m.group(1)
> +                else:
> +                    filename = None
>              elif line.startswith('diff -r'):
>                  # format: "diff -r ... -r ... filename"
>                  filename = diffre.search(line).group(1)
> +        elif line.startswith('rename from '):
> +            filename = line[12:].rstrip(' \r\n')
> +        elif line.startswith('copy from '):
> +            filename = line[10:].rstrip(' \r\n')

I already met those "line.startswith('rename from ')" can they be factored ?

>          elif line.startswith('+') and not line.startswith('+++ '):
>              adds += 1
>          elif line.startswith('-') and not line.startswith('--- '):
> @@ -1829,6 +1902,7 @@
>          elif (line.startswith('GIT binary patch') or
>                line.startswith('Binary file')):
>              isbinary = True
> +
>      addresult()
>      return results
>  
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel

-- 
Pierre-Yves David

http://www.logilab.fr/

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://selenic.com/pipermail/mercurial-devel/attachments/20121210/afa8787f/attachment.pgp>