[PATCH 3 of 5] encoding: add function to test if a str consists of ASCII characters
Yuya Nishihara
yuya at tcha.org
Fri Aug 18 10:14:11 EDT 2017
# HG changeset patch
# User Yuya Nishihara <yuya at tcha.org>
# Date 1492919982 -32400
# Sun Apr 23 12:59:42 2017 +0900
# Node ID 394f90f7dacb585c23ec5efa8958c37620ce2418
# Parent a72635d28242bec2c5165ef938bc4abc4a23e7b4
encoding: add function to test if a str consists of ASCII characters
Most strings are ASCII, so let's optimize for that case.
Using uint64_t is slightly faster than uint32_t on 64-bit systems, but the
difference isn't huge.
diff --git a/contrib/python3-whitelist b/contrib/python3-whitelist
--- a/contrib/python3-whitelist
+++ b/contrib/python3-whitelist
@@ -18,6 +18,7 @@ test-doctest.py
test-duplicateoptions.py
test-empty-dir.t
test-empty.t
+test-encoding-func.py
test-excessive-merge.t
test-hghave.t
test-issue1089.t
diff --git a/mercurial/cext/charencode.c b/mercurial/cext/charencode.c
--- a/mercurial/cext/charencode.c
+++ b/mercurial/cext/charencode.c
@@ -12,6 +12,7 @@
#include <assert.h>
#include "charencode.h"
+#include "compat.h"
#include "util.h"
#ifdef IS_PY3K
@@ -125,6 +126,29 @@ PyObject *unhexlify(const char *str, Py_
return ret;
}
+/*
+ * Python-callable predicate: return True if no byte of the argument buffer
+ * has its high bit (0x80) set, False as soon as one such byte is found.
+ * A zero-length buffer yields True. Returns NULL if PyArg_ParseTuple()
+ * rejects the arguments.
+ *
+ * NOTE(review): "s#" only stores the length as a Py_ssize_t when
+ * PY_SSIZE_T_CLEAN is defined before Python.h is included; that define is
+ * not visible in this hunk -- confirm at the top of the file.
+ */
+PyObject *isasciistr(PyObject *self, PyObject *args)
+{
+ const char *buf;
+ Py_ssize_t i, len;
+ if (!PyArg_ParseTuple(args, "s#:isasciistr", &buf, &len))
+ return NULL;
+ i = 0;
+ /* char array in PyStringObject should be at least 4-byte aligned */
+ /*
+ * Fast path, taken only when buf really is 4-byte aligned: test four
+ * bytes per iteration by masking the high bit of each byte in a 32-bit
+ * word. Here i counts whole words, not bytes.
+ */
+ if (((uintptr_t)buf & 3) == 0) {
+ const uint32_t *p = (const uint32_t *)buf;
+ for (; i < len / 4; i++) {
+ if (p[i] & 0x80808080U)
+ Py_RETURN_FALSE;
+ }
+ /* convert the word index back to a byte index for the tail loop */
+ i *= 4;
+ }
+ /*
+ * Byte-wise scan of whatever the fast path did not cover: the trailing
+ * len % 4 bytes, or the entire buffer (i is still 0) when buf was not
+ * aligned.
+ */
+ for (; i < len; i++) {
+ if (buf[i] & 0x80)
+ Py_RETURN_FALSE;
+ }
+ Py_RETURN_TRUE;
+}
+
static inline PyObject *_asciitransform(PyObject *str_obj,
const char table[128],
PyObject *fallback_fn)
diff --git a/mercurial/cext/charencode.h b/mercurial/cext/charencode.h
--- a/mercurial/cext/charencode.h
+++ b/mercurial/cext/charencode.h
@@ -19,6 +19,7 @@ enum normcase_spec {
};
PyObject *unhexlify(const char *str, Py_ssize_t len);
+PyObject *isasciistr(PyObject *self, PyObject *args);
PyObject *asciilower(PyObject *self, PyObject *args);
PyObject *asciiupper(PyObject *self, PyObject *args);
PyObject *make_file_foldmap(PyObject *self, PyObject *args);
diff --git a/mercurial/cext/parsers.c b/mercurial/cext/parsers.c
--- a/mercurial/cext/parsers.c
+++ b/mercurial/cext/parsers.c
@@ -696,6 +696,7 @@ static PyMethodDef methods[] = {
{"parse_manifest", parse_manifest, METH_VARARGS, "parse a manifest\n"},
{"parse_dirstate", parse_dirstate, METH_VARARGS, "parse a dirstate\n"},
{"parse_index2", parse_index2, METH_VARARGS, "parse a revlog index\n"},
+ {"isasciistr", isasciistr, METH_VARARGS, "check if an ASCII string\n"},
{"asciilower", asciilower, METH_VARARGS, "lowercase an ASCII string\n"},
{"asciiupper", asciiupper, METH_VARARGS, "uppercase an ASCII string\n"},
{"dict_new_presized", dict_new_presized, METH_VARARGS,
@@ -716,7 +717,7 @@ void dirs_module_init(PyObject *mod);
void manifest_module_init(PyObject *mod);
void revlog_module_init(PyObject *mod);
-static const int version = 2;
+static const int version = 3;
static void module_init(PyObject *mod)
{
diff --git a/mercurial/compat.h b/mercurial/compat.h
--- a/mercurial/compat.h
+++ b/mercurial/compat.h
@@ -7,8 +7,10 @@
#define inline __inline
#if defined(_WIN64)
typedef __int64 ssize_t;
+typedef unsigned __int64 uintptr_t;
#else
typedef int ssize_t;
+typedef unsigned int uintptr_t;
#endif
typedef signed char int8_t;
typedef short int16_t;
diff --git a/mercurial/encoding.py b/mercurial/encoding.py
--- a/mercurial/encoding.py
+++ b/mercurial/encoding.py
@@ -24,6 +24,7 @@ from .pure import (
charencode = policy.importmod(r'charencode')
+isasciistr = charencode.isasciistr
asciilower = charencode.asciilower
asciiupper = charencode.asciiupper
_jsonescapeu8fast = charencode.jsonescapeu8fast
diff --git a/mercurial/policy.py b/mercurial/policy.py
--- a/mercurial/policy.py
+++ b/mercurial/policy.py
@@ -75,7 +75,7 @@ def _importfrom(pkgname, modname):
(r'cext', r'diffhelpers'): 1,
(r'cext', r'mpatch'): 1,
(r'cext', r'osutil'): 1,
- (r'cext', r'parsers'): 2,
+ (r'cext', r'parsers'): 3,
}
# map import request to other package or module
diff --git a/mercurial/pure/charencode.py b/mercurial/pure/charencode.py
--- a/mercurial/pure/charencode.py
+++ b/mercurial/pure/charencode.py
@@ -13,6 +13,13 @@ from .. import (
pycompat,
)
+def isasciistr(s):
+ '''return True if s can be decoded as ASCII, False if decoding fails
+
+ Only UnicodeDecodeError is caught; any other exception raised by
+ s.decode() propagates to the caller.
+ '''
+ try:
+ s.decode('ascii')
+ return True
+ except UnicodeDecodeError:
+ return False
+
def asciilower(s):
'''convert a string to lowercase if ASCII
diff --git a/tests/test-encoding-func.py b/tests/test-encoding-func.py
new file mode 100644
--- /dev/null
+++ b/tests/test-encoding-func.py
@@ -0,0 +1,33 @@
+from __future__ import absolute_import
+
+import unittest
+
+from mercurial import (
+ encoding,
+)
+
+# Tests for encoding.isasciistr() using short byte strings.
+class IsasciistrTest(unittest.TestCase):
+ # Pure-ASCII fixtures of lengths 1-5 and 9, plus one with an embedded NUL.
+ # Presumably the lengths are chosen to exercise both the 4-bytes-at-a-time
+ # path and the byte-wise tail of the C implementation -- confirm.
+ asciistrs = [
+ b'a',
+ b'ab',
+ b'abc',
+ b'abcd',
+ b'abcde',
+ b'abcdefghi',
+ b'abcd\0fghi',
+ ]
+
+ # Every fixture must be reported as ASCII.
+ def testascii(self):
+ for s in self.asciistrs:
+ self.assertTrue(encoding.isasciistr(s))
+
+ # For each fixture and each byte position, set the high bit of that one
+ # byte and expect the result to be reported as non-ASCII.
+ def testnonasciichar(self):
+ for s in self.asciistrs:
+ for i in range(len(s)):
+ # fresh mutable copy so only position i differs from the fixture
+ t = bytearray(s)
+ t[i] |= 0x80
+ self.assertFalse(encoding.isasciistr(bytes(t)))
+
+# When run directly as a script, hand this module to silenttestrunner.main().
+if __name__ == '__main__':
+ import silenttestrunner
+ silenttestrunner.main(__name__)
More information about the Mercurial-devel
mailing list