D7910: rust-re2: add wrapper for calling Re2 from Rust

Alphare (Raphaël Gomès) phabricator at mercurial-scm.org
Thu Jan 16 16:52:21 EST 2020


Alphare marked an inline comment as done.
Alphare updated this revision to Diff 19386.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D7910?vs=19360&id=19386

BRANCH
  default

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7910/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7910

AFFECTED FILES
  rust/Cargo.lock
  rust/hg-core/Cargo.toml
  rust/hg-core/build.rs
  rust/hg-core/src/lib.rs
  rust/hg-core/src/re2/mod.rs
  rust/hg-core/src/re2/re2.rs
  rust/hg-core/src/re2/rust_re2.cpp
  rust/hg-cpython/Cargo.toml
  setup.py

CHANGE DETAILS

diff --git a/setup.py b/setup.py
--- a/setup.py
+++ b/setup.py
@@ -1355,10 +1355,19 @@
             env['HOME'] = pwd.getpwuid(os.getuid()).pw_dir
 
         cargocmd = ['cargo', 'rustc', '-vv', '--release']
+
+        feature_flags = []
+
         if sys.version_info[0] == 3 and self.py3_features is not None:
-            cargocmd.extend(
-                ('--features', self.py3_features, '--no-default-features')
-            )
+            feature_flags.append(self.py3_features)
+            cargocmd.append('--no-default-features')
+
+        rust_features = env.get("HG_RUST_FEATURES")
+        if rust_features:
+            feature_flags.append(rust_features)
+
+        cargocmd.extend(('--features', " ".join(feature_flags)))
+
         cargocmd.append('--')
         if sys.platform == 'darwin':
             cargocmd.extend(
diff --git a/rust/hg-cpython/Cargo.toml b/rust/hg-cpython/Cargo.toml
--- a/rust/hg-cpython/Cargo.toml
+++ b/rust/hg-cpython/Cargo.toml
@@ -10,6 +10,7 @@
 
 [features]
 default = ["python27"]
+with-re2 = ["hg-core/with-re2"]
 
 # Features to build an extension module:
 python27 = ["cpython/python27-sys", "cpython/extension-module-2-7"]
@@ -21,7 +22,7 @@
 python3-bin = ["cpython/python3-sys"]
 
 [dependencies]
-hg-core = { path = "../hg-core" }
+hg-core = { path = "../hg-core"}
 libc = '*'
 
 [dependencies.cpython]
diff --git a/rust/hg-core/src/re2/rust_re2.cpp b/rust/hg-core/src/re2/rust_re2.cpp
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/re2/rust_re2.cpp
@@ -0,0 +1,49 @@
+/*
+rust_re2.cpp
+
+C ABI export of Re2's C++ interface for Rust FFI.
+
+Copyright 2020 Valentin Gatien-Baron
+
+This software may be used and distributed according to the terms of the
+GNU General Public License version 2 or any later version.
+*/
+
+#include <re2/re2.h>
+using namespace re2;
+
+extern "C" {
+	RE2* rust_re2_create(const char* data, size_t len) {
+		RE2::Options o;
+		o.set_encoding(RE2::Options::Encoding::EncodingLatin1);
+		o.set_log_errors(false);
+		o.set_max_mem(50000000);
+
+		return new RE2(StringPiece(data, len), o);
+	}
+
+	void rust_re2_destroy(RE2* re) {
+		delete re;
+	}
+
+	bool rust_re2_ok(RE2* re) {
+		return re->ok();
+	}
+
+	void rust_re2_error(RE2* re, const char** outdata, size_t* outlen) {
+		const std::string& e = re->error();
+		*outdata = e.data();
+		*outlen = e.length();
+	}
+
+	bool rust_re2_match(RE2* re, char* data, size_t len, int ianchor) {
+		const StringPiece sp = StringPiece(data, len);
+
+		RE2::Anchor anchor =
+			ianchor == 0 ? RE2::Anchor::UNANCHORED :
+			(ianchor == 1 ? RE2::Anchor::ANCHOR_START :
+			 RE2::Anchor::ANCHOR_BOTH);
+
+		return re->Match(sp, 0, len, anchor, NULL, 0);
+	}
+}
diff --git a/rust/hg-core/src/re2/re2.rs b/rust/hg-core/src/re2/re2.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/re2/re2.rs
@@ -0,0 +1,66 @@
+/*
+re2.rs
+
+Rust FFI bindings to Re2.
+
+Copyright 2020 Valentin Gatien-Baron
+
+This software may be used and distributed according to the terms of the
+GNU General Public License version 2 or any later version.
+*/
+use libc::{c_int, c_void};
+
+type Re2Ptr = *const c_void;
+
+pub struct Re2(Re2Ptr);
+
+/// `re2.h` says:
+/// "An "RE2" object is safe for concurrent use by multiple threads."
+unsafe impl Sync for Re2 {}
+
+/// These bind to the C ABI in `rust_re2.cpp`.
+extern "C" {
+    fn rust_re2_create(data: *const u8, len: usize) -> Re2Ptr;
+    fn rust_re2_destroy(re2: Re2Ptr);
+    fn rust_re2_ok(re2: Re2Ptr) -> bool;
+    fn rust_re2_error(
+        re2: Re2Ptr,
+        outdata: *mut *const u8,
+        outlen: *mut usize,
+    ) -> bool;
+    fn rust_re2_match(
+        re2: Re2Ptr,
+        data: *const u8,
+        len: usize,
+        anchor: c_int,
+    ) -> bool;
+}
+
+impl Re2 {
+    pub fn new(pattern: &[u8]) -> Result<Re2, String> {
+        unsafe {
+            let re2 = rust_re2_create(pattern.as_ptr(), pattern.len());
+            if rust_re2_ok(re2) {
+                Ok(Re2(re2))
+            } else {
+                let mut data: *const u8 = std::ptr::null();
+                let mut len: usize = 0;
+                rust_re2_error(re2, &mut data, &mut len);
+                Err(String::from_utf8_lossy(std::slice::from_raw_parts(
+                    data, len,
+                ))
+                .to_string())
+            }
+        }
+    }
+
+    pub fn is_match(&self, data: &[u8]) -> bool {
+        unsafe { rust_re2_match(self.0, data.as_ptr(), data.len(), 1) }
+    }
+}
+
+impl Drop for Re2 {
+    fn drop(&mut self) {
+        unsafe { rust_re2_destroy(self.0) }
+    }
+}
diff --git a/rust/hg-core/src/re2/mod.rs b/rust/hg-core/src/re2/mod.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/re2/mod.rs
@@ -0,0 +1,21 @@
+/// re2 module
+///
+/// The Python implementation of Mercurial uses the Re2 regex engine when
+/// possible and if the bindings are installed, falling back to Python's `re`
+/// in case of unsupported syntax (Re2 is a non-backtracking engine).
+///
+/// Using it from Rust is not ideal. We need C++ bindings, a C++ compiler,
+/// Re2 needs to be installed... why not just use the `regex` crate?
+///
+/// Using Re2 from the Rust implementation guarantees backwards compatibility.
+/// We know it will work out of the box without needing to figure out the
+/// subtle differences in syntax. For example, `regex` currently does not
+/// support empty alternations (regex like `a||b`) which happens more often
+/// than we might think. Old benchmarks also showed worse performance from
+/// regex than with Re2, but the methodology and results were lost, so take
+/// this with a grain of salt.
+///
+/// The idea is to use Re2 for now as a temporary phase and then investigate
+/// how much work would be needed to use `regex`.
+mod re2;
+pub use re2::Re2;
diff --git a/rust/hg-core/src/lib.rs b/rust/hg-core/src/lib.rs
--- a/rust/hg-core/src/lib.rs
+++ b/rust/hg-core/src/lib.rs
@@ -21,6 +21,8 @@
 pub mod matchers;
 pub mod revlog;
 pub use revlog::*;
+#[cfg(feature = "with-re2")]
+pub mod re2;
 pub mod utils;
 
 use crate::utils::hg_path::{HgPathBuf, HgPathError};
diff --git a/rust/hg-core/build.rs b/rust/hg-core/build.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/build.rs
@@ -0,0 +1,25 @@
+// build.rs
+//
+// Copyright 2020 Raphaël Gomès <rgomes at octobus.net>
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+#[cfg(feature = "with-re2")]
+use cc;
+
+#[cfg(feature = "with-re2")]
+fn compile_re2() {
+    cc::Build::new()
+        .cpp(true)
+        .flag("-std=c++11")
+        .file("src/re2/rust_re2.cpp")
+        .compile("librustre.a");
+
+    println!("cargo:rustc-link-lib=re2");
+}
+
+fn main() {
+    #[cfg(feature = "with-re2")]
+    compile_re2();
+}
diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml
+++ b/rust/hg-core/Cargo.toml
@@ -4,6 +4,7 @@
 authors = ["Georges Racinet <gracinet at anybox.fr>"]
 description = "Mercurial pure Rust core library, with no assumption on Python bindings (FFI)"
 edition = "2018"
+build = "build.rs"
 
 [lib]
 name = "hg"
@@ -11,6 +12,7 @@
 [dependencies]
 byteorder = "1.3.1"
 lazy_static = "1.3.0"
+libc = { version = "0.2.66", optional = true }
 memchr = "2.2.0"
 rand = "0.6.5"
 rand_pcg = "0.1.1"
@@ -21,4 +23,11 @@
 
 [dev-dependencies]
 tempfile = "3.1.0"
-pretty_assertions = "0.6.1"
\ No newline at end of file
+pretty_assertions = "0.6.1"
+
+[build-dependencies]
+cc = { version = "1.0.48", optional = true }
+
+[features]
+default = []
+with-re2 = ["cc", "libc"]
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -49,6 +49,11 @@
 ]
 
 [[package]]
+name = "cc"
+version = "1.0.50"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
 name = "cfg-if"
 version = "0.1.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -66,7 +71,7 @@
 version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
  "num-traits 0.2.8 (registry+https://github.com/rust-lang/crates.io-index)",
  "python27-sys 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "python3-sys 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -141,7 +146,7 @@
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
  "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
- "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
  "wasi 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
@@ -150,7 +155,9 @@
 version = "0.1.0"
 dependencies = [
  "byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "cc 1.0.50 (registry+https://github.com/rust-lang/crates.io-index)",
  "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
  "memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "pretty_assertions 0.6.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -168,7 +175,7 @@
 dependencies = [
  "cpython 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "hg-core 0.1.0",
- "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -176,7 +183,7 @@
 version = "0.1.0"
 dependencies = [
  "hg-core 0.1.0",
- "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -186,7 +193,7 @@
 
 [[package]]
 name = "libc"
-version = "0.2.64"
+version = "0.2.66"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [[package]]
@@ -220,7 +227,7 @@
 version = "1.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -260,7 +267,7 @@
 version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
  "regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
@@ -269,7 +276,7 @@
 version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
  "regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
@@ -287,7 +294,7 @@
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
  "autocfg 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)",
- "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand_chacha 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand_core 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand_hc 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -305,7 +312,7 @@
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
  "getrandom 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)",
- "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand_chacha 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand_core 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand_hc 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -379,7 +386,7 @@
 version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand_core 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
@@ -391,7 +398,7 @@
 dependencies = [
  "cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)",
  "fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
- "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand_core 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -523,7 +530,7 @@
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
  "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
- "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)",
  "remove_dir_all 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
@@ -591,6 +598,7 @@
 "checksum bitflags 1.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693"
 "checksum byteorder 1.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a7c3dd8985a7111efc5c80b44e23ecdd8c007de8ade3b96595387e812b957cf5"
 "checksum c2-chacha 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7d64d04786e0f528460fc884753cf8dddcc466be308f6026f8e355c41a0e4101"
+"checksum cc 1.0.50 (registry+https://github.com/rust-lang/crates.io-index)" = "95e28fa049fda1c330bcf9d723be7663a899c4679724b34c81e9f5a326aab8cd"
 "checksum cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)" = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822"
 "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
 "checksum cpython 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "85532c648315aeb0829ad216a6a29aa3212cf9319bc7f6daf1404aa0bdd1485f"
@@ -604,7 +612,7 @@
 "checksum fuchsia-cprng 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "a06f77d526c1a601b7c4cdd98f54b5eaabffc14d5f2f0296febdc7f357c6d3ba"
 "checksum getrandom 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)" = "473a1265acc8ff1e808cd0a1af8cee3c2ee5200916058a2ca113c29f2d903571"
 "checksum lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
-"checksum libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)" = "74dfca3d9957906e8d1e6a0b641dc9a59848e793f1da2165889fd4f62d10d79c"
+"checksum libc 0.2.66 (registry+https://github.com/rust-lang/crates.io-index)" = "d515b1f41455adea1313a4a2ac8a8a477634fbae63cc6100e3aebb207ce61558"
 "checksum memchr 2.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "88579771288728879b57485cc7d6b07d648c9f0141eb955f8ab7f9d45394468e"
 "checksum memoffset 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ce6075db033bbbb7ee5a0bbd3a3186bbae616f57fb001c485c7ff77955f8177f"
 "checksum nodrop 0.1.14 (registry+https://github.com/rust-lang/crates.io-index)" = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"



To: Alphare, #hg-reviewers, kevincox
Cc: durin42, kevincox, mjpieters, mercurial-devel


More information about the Mercurial-devel mailing list