[PATCH] patch: improve handling of filenames containing spaces on import (issue3723)
Pierre-Yves David
pierre-yves.david at logilab.fr
Mon Dec 10 05:20:37 CST 2012
On Mon, Dec 10, 2012 at 09:50:29AM +0000, Alastair Houghton wrote:
> # HG changeset patch
> # User Alastair Houghton <alastair at coriolis-systems.com>
> # Date 1355130961 0
> # Node ID dd56046b73acdb976ff98e91469c36494b1ddd94
> # Parent 40374059d227850ec2f5fb4f21a1b619136e2a6a
> patch: improve handling of filenames containing spaces on import (issue3723)
>
> When reading git-format patches, it's possible that we will come across
> a line like:
>
> diff --git a/foo b/foo.txt b/foo b/foo.txt
>
> If the git-format diff in question is a rename or a copy, all is fine,
> because we can reliably read the filenames from subsequent git headers.
> However, if it turns out to be some other kind of diff, we might need
> to extract the filename(s) from a line in this format; this is especially
> the case when dealing with a binary diff, as git's binary diff format
> doesn't repeat the filenames at all.
>
> Fortunately, there is a heuristic we can use to render the above
> parseable, namely that the two filenames must be the same (since
> otherwise, we're looking at a rename or a copy and we already covered
> that case).
>
> This patch fixes iterhunks(), readgitpatch() and diffstatdata() to use
> this approach.
Bellow is some comment on your patches, some of them impact code that you just
move around. Feel free to either sent a prelude patch that clean that up or
ignore them.
This is not a LGTM as the whole function is still pretty confusing to me. I'll
try to get a second deeper pass soon.
> diff --git a/mercurial/patch.py b/mercurial/patch.py
> --- a/mercurial/patch.py
> +++ b/mercurial/patch.py
> @@ -15,6 +15,7 @@
> import context
>
> gitre = re.compile('diff --git a/(.*) b/(.*)')
> +gitre_no_namechange = re.compile(r'diff --git a/(.*) b/\1$')
You want a ^ in front of this regex. Otherwise pathological case forged by some
leprechaun might still bite you'
note: I love how greedy a/(.*) ensure consistency of b/\1$ here.
>
> class PatchError(Exception):
> pass
> @@ -270,7 +271,8 @@
> 'islink' is True if the file is a symlink and 'isexec' is True if
> the file is executable. Otherwise, 'mode' is None.
> """
> - def __init__(self, path):
> + def __init__(self, path, diffline=None):
> + self.diffline = diffline
> self.path = path
> self.oldpath = None
> self.mode = None
> @@ -283,13 +285,26 @@
> self.mode = (islink, isexec)
>
> def copy(self):
> - other = patchmeta(self.path)
> + other = patchmeta(self.path, diffline=self.diffline)
> other.oldpath = self.oldpath
> other.mode = self.mode
> other.op = self.op
> other.binary = self.binary
> return other
>
> + def reparse(self):
> + if self.op == 'RENAME' or self.op == 'COPY':
> + return True
> +
> + # reparse the diffline with a stricter regex
> + m = gitre_no_namechange.match(self.diffline)
> + if not m:
> + return False
> +
> + self.path = m.group(1)
> +
> + return True
> +
What's the purpose of this function ? can you add a small docstring to it ?
> def _ispatchinga(self, afile):
> if afile == '/dev/null':
> return self.op == 'ADD'
> @@ -317,16 +332,16 @@
> if line.startswith('diff --git'):
> m = gitre.match(line)
> if m:
> - if gp:
> + if gp and gp.reparse():
> gitpatches.append(gp)
> dst = m.group(2)
> - gp = patchmeta(dst)
> + gp = patchmeta(dst, diffline=line)
> elif gp:
> if line.startswith('--- '):
> - gitpatches.append(gp)
> + if gp.reparse():
> + gitpatches.append(gp)
> gp = None
> - continue
> - if line.startswith('rename from '):
> + elif line.startswith('rename from '):
> gp.op = 'RENAME'
> gp.oldpath = line[12:]
> elif line.startswith('rename to '):
> @@ -345,7 +360,11 @@
> gp.setmode(int(line[-6:], 8))
> elif line.startswith('GIT binary patch'):
> gp.binary = True
> - if gp:
> + if gp.reparse():
> + gitpatches.append(gp)
> + gp = None
> +
> + if gp and gp.reparse():
> gitpatches.append(gp)
>
> return gitpatches
> @@ -1189,6 +1208,11 @@
>
> # our states
> BFILE = 1
> +
> + scanninggit = False
> + diffline = None
> + gitop = None
> +
I would add some comment that explain what those state are about and how they
impact the process.
> context = None
> lr = linereader(fp)
>
> @@ -1196,6 +1220,51 @@
> x = lr.readline()
> if not x:
> break
> +
> + if scanninggit:
> + # git format patch headers end at --- or GIT binary patch
> + if x.startswith('---') or x.startswith('GIT binary patch'):
> + scanninggit = False
> +
> + ok = True
> + if gitop != 'RENAME' and gitop != 'COPY':
nitpick: if gitop not in ('RENAME', 'COPY'):
> + # use the stricter regex
> + m = gitre_no_namechange.match(diffline)
> + if m:
> + afile = 'a/' + m.group(1)
> + bfile = 'b/' + m.group(1)
> + else:
> + ok = False
> +
> + if ok:
> + while gitpatches \
> + and not gitpatches[-1].ispatching(afile, bfile):
> + gp = gitpatches.pop()
> + yield 'file', ('a/' + gp.path, 'b/' + gp.path, None,
> + gp.copy())
Sound like a for loop with a break and an else clause.
(yep, it looks like code movement, but that also a good opportunity
> + if not gitpatches:
> + raise PatchError(_('failed to synchronize metadata '
> + 'for "%s"')
> + % afile[2:])
> + gp = gitpatches[-1]
> + newfile = True
> + elif x.startswith('rename from '):
> + gitop = 'RENAME'
> + afile = 'a/' + x[12:].rstrip(' \r\n')
> + elif x.startswith('rename to '):
> + bfile = 'b/' + x[10:].rstrip(' \r\n')
> + elif x.startswith('copy from '):
> + gitop = 'COPY'
> + afile = 'a/' + x[10:].rstrip(' \r\n')
> + elif x.startswith('copy to '):
> + bfile = 'b/' + x[8:].rstrip(' \r\n')
if looks like adding the 'a/' and 'b/' prefix later would help readability here
(applies to `gitre_no_namechange` handling too)
Is that possible ?
> +
> + if newfile:
> + newfile = False
> + emitfile = True
> + state = BFILE
> + hunknum = 0
> +
> if state == BFILE and (
> (not context and x[0] == '@')
> or (context is not False and x.startswith('***************'))
> @@ -1216,25 +1285,26 @@
> yield 'file', (afile, bfile, h, gp and gp.copy() or None)
> yield 'hunk', h
> elif x.startswith('diff --git'):
> - m = gitre.match(x.rstrip(' \r\n'))
> + xs = x.rstrip(' \r\n')
> + m = gitre.match(xs)
> if not m:
> continue
> +
> if gitpatches is None:
> # scan whole input for git metadata
> gitpatches = scangitpatch(lr, x)
> yield 'git', [g.copy() for g in gitpatches
> if g.op in ('COPY', 'RENAME')]
> gitpatches.reverse()
> +
> + # defer processing of git metadata until we know for certain
> + # the names of the files (we need to see the metadata for this
> + # patch before we really know what they are)
> + scanninggit = True
> + diffline = xs
> afile = 'a/' + m.group(1)
> bfile = 'b/' + m.group(2)
> - while gitpatches and not gitpatches[-1].ispatching(afile, bfile):
> - gp = gitpatches.pop()
> - yield 'file', ('a/' + gp.path, 'b/' + gp.path, None, gp.copy())
> - if not gitpatches:
> - raise PatchError(_('failed to synchronize metadata for "%s"')
> - % afile[2:])
> - gp = gitpatches[-1]
> - newfile = True
> + gitop = 'MODIFY'
> elif x.startswith('---'):
> # check for a unified diff
> l2 = lr.readline()
> @@ -1261,11 +1331,6 @@
> afile = parsefilename(x)
> bfile = parsefilename(l2)
>
> - if newfile:
> - newfile = False
> - emitfile = True
> - state = BFILE
> - hunknum = 0
>
> while gitpatches:
> gp = gitpatches.pop()
> @@ -1818,10 +1883,18 @@
> # set numbers to 0 anyway when starting new file
> adds, removes, isbinary = 0, 0, False
> if line.startswith('diff --git'):
> - filename = gitre.search(line).group(1)
> + m = gitre_no_namechange.search(line)
> + if m:
> + filename = m.group(1)
> + else:
> + filename = None
> elif line.startswith('diff -r'):
> # format: "diff -r ... -r ... filename"
> filename = diffre.search(line).group(1)
> + elif line.startswith('rename from '):
> + filename = line[12:].rstrip(' \r\n')
> + elif line.startswith('copy from '):
> + filename = line[10:].rstrip(' \r\n')
I already met those "line.startswith('rename from ')" can they be factored ?
> elif line.startswith('+') and not line.startswith('+++ '):
> adds += 1
> elif line.startswith('-') and not line.startswith('--- '):
> @@ -1829,6 +1902,7 @@
> elif (line.startswith('GIT binary patch') or
> line.startswith('Binary file')):
> isbinary = True
> +
> addresult()
> return results
>
> _______________________________________________
> Mercurial-devel mailing list
> Mercurial-devel at selenic.com
> http://selenic.com/mailman/listinfo/mercurial-devel
--
Pierre-Yves David
http://www.logilab.fr/
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://selenic.com/pipermail/mercurial-devel/attachments/20121210/afa8787f/attachment.pgp>
More information about the Mercurial-devel
mailing list