[PATCH] Don't consider UTF-16 and UTF-32 files as binary (issue1975) (version 2)

Ben benoit.allard at gmx.de
Thu Feb 4 12:07:52 CST 2010


The generated patches work fine with GNU diff. I mainly need this to
review my commits on Windows, where some tools automatically generate
config files in UTF-16 ...

# HG changeset patch
# User Benoit Allard <benoit at aeteurope.nl>
# Date 1265306735 -3600
Don't consider UTF-16 and UTF-32 files as binary (issue1975)

diff --git a/mercurial/util.py b/mercurial/util.py
--- a/mercurial/util.py
+++ b/mercurial/util.py
@@ -14,7 +14,7 @@
 """

 from i18n import _
-import error, osutil, encoding
+import error, osutil, encoding, codecs
 import cStringIO, errno, re, shutil, sys, tempfile, traceback
 import os, stat, time, calendar, textwrap
 import imp
@@ -210,9 +210,20 @@
             return fn(s, cmd[len(name):].lstrip())
     return pipefilter(s, cmd)

+boms = [
+    codecs.BOM_UTF8,
+    codecs.BOM_UTF16_BE, codecs.BOM_UTF16_LE,
+    codecs.BOM_UTF32_BE, codecs.BOM_UTF32_LE
+    ]
+
 def binary(s):
     """return true if a string is binary data"""
-    return bool(s and '\0' in s)
+    if s:
+        for bom in boms:
+            if s.startswith(bom):
+                return False
+        return '\0' in s
+    return False

 def increasingchunks(source, min=1024, max=65536):
     '''return no less than min bytes per chunk while data remains,
diff --git a/tests/test-diff-binary-file b/tests/test-diff-binary-file
--- a/tests/test-diff-binary-file
+++ b/tests/test-diff-binary-file
@@ -23,3 +23,12 @@

 echo % diff --git -r 0 -r 2
 hg diff --git -r 0 -r 2
+
+echo % UTF-16 diff
+echo "\xff\xfe\xe4\x00\x0a\x00" > utf-16le
+echo "\x00\x00\xfe\xff\x00\x00\x00\x31\x00\x00\x00\x0a" > utf-32be
+hg add utf-16le utf-32be
+hg ci -m "add files"
+echo "\x31\x00\x0a\x00" >> utf-16le
+echo "\x00\x00\x00\x32\x00\x00\x00\x0a" >> utf-32be
+hg diff --nodates utf-16le utf-32be
diff --git a/tests/test-diff-binary-file.out b/tests/test-diff-binary-file.out
index 048f20400cae4202b0d5c43d344f8273d5a216c5..1fa8e52e8b1f5ff863f2a586185879fb168997ec
GIT binary patch
literal 1315
zc${@q*^;7I5cQd_pu#0eZF<;|P#Q`_g;Lzbss~F}K{gR76)C@VzN354h^eWGo_WaS
z<jpu4CnEC}LESb@5ZZ(A%^|?w5U;5ig%C7F;`bLtGZa)&aiXTxK-Kn5+cmyt5op@J
zvOj}2{Iw2RN(5?_;%^KcDE@m2`dNTB_TjhT_oDL0S2Fc00&HJ5P9R393Z^3(X(AMY
zQ97*=CP7i8slYg_DJEeW6lR+A_wPha*Dyq(4b;S8lfq3C%Y@J{W>Pwd!fMSRR0<<V
zpexL78g6=a%0aUL*FG3~#RUl(2NJEol|{JGQx2nY!>VJ?Y2Sfi*gzz)KMS+W>)zZD
zjv@hdbX6E+5T+GMZF*U<oCC={z!TYzsq?`ZqRYq;1|4W_(KETe*bTPF_pF70B*$&M
zwrQ>-k0QpXQ<ugMxggCQ9Y~f1-+$>J3Q%G3z{ck)vM8>T5^phzmFh7+Y2fKtwze_0
zQCEJaQnOzI^(1$pFkHF}dL;%xQW1i7vmA9-EmMkvrWf{Si}|Wi=lo-D^ne`qvcA4N
zdcA;LNK5%}wl0dBgkIFEoL`dNdw&Kbef^~|W1J{G1{Sm~!N`NL%|D*h1!AT<p~4jF
zI#ldb_{uW>+FRp7k(^Ge1zv3PCGEh}b^(ysQvSleZBwURZ0TuWhkCR=d0agZp%p82
zv92EOBbJCOiBP}z!R8d5uB)`;uU^=$u91y8keCCpYIhKE-5;DvPbjPNM at g==TtiJp
zGgYuIT)~=>m9^935WUzBDDx@*AtB{JqShO)a9sdN_1He;%fn7M(D+OAAWATFvbr_l
z@U&ex;*r<XH73ihXwK0+TOa8~-J4WH&YSLSBBbAB2uPF(dgWV7wuv6A3EvNr793T`
z*K0cNuFMN2<Qr?YJf`N5_?m`<T&Kr1`;FN(<7pE6CuW`rp<88Q+YDZg^or$s=dt@Z
z_&|p4SZghRxXWf`*p^@zdo7I+>WwR7Bb%||qYC2<nSF=i59&GF3(p%rWokD3LpY&>
z+IGJ#GK%_EimgYNoiO|QY-@*sPa^^fSwhSb)Rn8?7G=@#aPpjIfX!Gqh%$7U1~ac*
zONo5mNpFh_!;<s9T*k1sH8Jt%L_L%Z_rQN2@$I(fvkrui_t($=1kqFtgLPaXND_<>
zX4c>u00cqVrGLU|wVIv!C*(K~LcXD375RpP90!2Ee*T#Qz#LFTGRyrxH-@TTx$p2F
b?mPUAJD2<UE0@b5nE_zNp39-Pv<mzMgj9x;


More information about the Mercurial-devel mailing list