[PATCH] convert/darcs: support changelogs with bytes 0x7F-0xFF (issue2411)

Brodie Rao brodie at bitheap.org
Fri Oct 1 10:17:22 CDT 2010


On Fri, Oct 1, 2010 at 11:15 AM, Brodie Rao <brodie at bitheap.org> wrote:
> # HG changeset patch
> # User Brodie Rao <brodie at bitheap.org>
> # Date 1285946104 18000
> # Branch stable
> # Node ID d0a4690bd027fcfd875726a764d4f1d8187fe0f8
> # Parent  10dcfba4f16d42f2fd80faf91818a0d6607591da
> convert/darcs: support changelogs with bytes 0x7F-0xFF (issue2411)
>
> This is a followup to 4481f8a93c7a, which only fixed the conversion of
> patches with UTF-8 metadata.
>
> This patch allows a changelog to have any bytes with values
> 0x7F-0xFF. It parses the XML changelog as Latin-1 and uses
> converter_source.recode() to decode the data as UTF-8/Latin-1.
>
> Caveats:
>
> - Since the convert extension doesn't provide any way to specify the
>  source encoding, users are still limited to UTF-8 and Latin-1.
>
> - etree will still complain if the changelog has bytes with values
>  0x00-0x19. XML only allows printable characters.

Whoops, the commit message should probably say 0x80-0xFF rather than 0x7F-0xFF. And this patch is meant for the stable branch.

> diff --git a/hgext/convert/darcs.py b/hgext/convert/darcs.py
> --- a/hgext/convert/darcs.py
> +++ b/hgext/convert/darcs.py
> @@ -7,22 +7,22 @@
>
>  from common import NoRepo, checktool, commandline, commit, converter_source
>  from mercurial.i18n import _
> -from mercurial import util
> +from mercurial import encoding, util
>  import os, shutil, tempfile, re
>
>  # The naming drift of ElementTree is fun!
>
>  try:
> -    from xml.etree.cElementTree import ElementTree
> +    from xml.etree.cElementTree import ElementTree, XMLParser
>  except ImportError:
>     try:
> -        from xml.etree.ElementTree import ElementTree
> +        from xml.etree.ElementTree import ElementTree, XMLParser
>     except ImportError:
>         try:
> -            from elementtree.cElementTree import ElementTree
> +            from elementtree.cElementTree import ElementTree, XMLParser
>         except ImportError:
>             try:
> -                from elementtree.ElementTree import ElementTree
> +                from elementtree.ElementTree import ElementTree, XMLParser
>             except ImportError:
>                 ElementTree = None
>
> @@ -88,12 +88,24 @@ class darcs_source(converter_source, com
>         self.ui.debug('cleaning up %s\n' % self.tmppath)
>         shutil.rmtree(self.tmppath, ignore_errors=True)
>
> +    def recode(self, s, encoding=None):
> +        if isinstance(s, unicode):
> +            # XMLParser returns unicode objects for anything it can't
> +            # encode into ASCII. We convert them back to str to get
> +            # recode's normal conversion behavior.
> +            s = s.encode('latin-1')
> +        return super(darcs_source, self).recode(s, encoding)
> +
>     def xml(self, cmd, **kwargs):
>         # NOTE: darcs is currently encoding agnostic and will print
>         # patch metadata byte-for-byte, even in the XML changelog.
>         etree = ElementTree()
> +        # While we are decoding the XML as latin-1 to be as liberal as
> +        # possible, etree will still raise an exception if any
> +        # non-printable characters are in the XML changelog.
> +        parser = XMLParser(encoding='latin-1')
>         fp = self._run(cmd, **kwargs)
> -        etree.parse(fp)
> +        etree.parse(fp, parser=parser)
>         self.checkexit(fp.close())
>         return etree.getroot()
>
> diff --git a/tests/test-convert-darcs b/tests/test-convert-darcs
> --- a/tests/test-convert-darcs
> +++ b/tests/test-convert-darcs
> @@ -65,9 +65,15 @@ echo g > g
>  # darcs is encoding agnostic, so it takes whatever bytes it's given
>  darcs record -a -l -m 'p4: desc ñ' -A 'author ñ'
>
> +echo % test latin-1 commit message
> +echo h > h
> +printf "p5: desc " > ../p5
> +python -c 'print "".join([chr(i) for i in range(128, 256)])' >> ../p5
> +darcs record -a -l --logfile ../p5
> +
>  glog()
>  {
> -    HGENCODING=utf-8 hg glog --template '{rev} "{desc|firstline}" ({author}) files: {files}\n' "$@"
> +    hg glog --template '{rev} "{desc|firstline}" ({author}) files: {files}\n' "$@"
>  }
>
>  cd ..
> @@ -78,6 +84,7 @@ hg convert darcs-repo darcs-repo-hg
>  # Unfortunately, non-conflicting changes, like the addition of the
>  # "c" file in p1.1 patch are reverted too.
>  # Just to say that manifest not listing "c" here is a bug.
> -glog -R darcs-repo-hg
> +HGENCODING=latin-1 glog -R darcs-repo-hg -r 6 | "$TESTDIR"/printrepr.py
> +HGENCODING=utf-8 glog -R darcs-repo-hg -r 0:5 | "$TESTDIR"/printrepr.py
>  hg up -q -R darcs-repo-hg
>  hg -R darcs-repo-hg manifest --debug
> diff --git a/tests/test-convert-darcs.out b/tests/test-convert-darcs.out
> --- a/tests/test-convert-darcs.out
> +++ b/tests/test-convert-darcs.out
> @@ -16,17 +16,22 @@ Finished recording patch 'p2'
>  Finished recording patch 'p3'
>  % test utf-8 commit message and author
>  Finished recording patch 'p4: desc ñ'
> +% test latin-1 commit message
> +Finished recording patch 'p5: desc €亗儎厗噲墛媽崕彁憭摂晼棙櫄洔潪煚、¥ウЖ┆辈炒刀犯购患骄坷谅媚牌侨墒颂臀闲岩釉罩棕仝圮蒉哙徕沅彐玷殛腱眍镳耱篝貊鼬�'
>  initializing destination darcs-repo-hg repository
>  scanning source...
>  sorting...
>  converting...
> -5 p0
> -4 p1.2
> -3 p1.1
> -2 p2
> -1 p3
> -0 p4: desc ?
> -o  5 "p4: desc ñ" (author ñ) files: g
> +6 p0
> +5 p1.2
> +4 p1.1
> +3 p2
> +2 p3
> +1 p4: desc ?
> +0 p5: desc ????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
> +o  6 "p5: desc \x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff" (test at example.org) files: h
> +|
> +o  5 "p4: desc \xc3\xb1" (author \xc3\xb1) files: g
>  |
>  o  4 "p3" (test at example.org) files: dir/d dir/d2 dir2/d f ff
>  |
> @@ -43,3 +48,4 @@ 1e88685f5ddec574a34c70af492f95b6debc8741
>  37406831adc447ec2385014019599dfec953c806 644   dir2/d
>  b783a337463792a5c7d548ad85a7d3253c16ba8c 644   ff
>  0973eb1b2ecc4de7fafe7447ce1b7462108b4848 644   g
> +fe6f8b4f507fe3eb524c527192a84920a4288dac 644   h
>


More information about the Mercurial-devel mailing list