[PATCH 1 of 3] rust-filepatterns: add a Rust implementation of pattern-related utils

Raphaël Gomès raphael.gomes at octobus.net
Wed Apr 17 00:00:38 UTC 2019


# HG changeset patch
# User Raphaël Gomès <rgomes at octobus.net>
# Date 1555442271 -7200
#      Tue Apr 16 21:17:51 2019 +0200
# Node ID ad489b44ef16ae39eb2c692fc5ccf3268a140a65
# Parent  a362b0b95e42c8f7d46d7e3a0eb4cc531fa5f2d6
# EXP-Topic rust-filepatterns
rust-filepatterns: add a Rust implementation of pattern-related utils

This change introduces Rust implementations of two functions related to
pattern handling, all located in `match.py`:

    - `_regex`
    - `readpatternfile`

These utils are useful in the long-term effort to improve `hg status`'s
performance using Rust. Experimental work done by Valentin Gatien-Baron
shows very promising improvements, but is too different from the current
Mercurial core code structure to be used "as-is".
This is the first - albeit very small - step towards the code revamp
needed down the line.

Two dependencies were added: `regex` and `lazy_static`. Both of them
will be useful for a majority of the Rust code that will be written,
are well known and maintained either by the Rust core team, or by
very frequent contributors.

diff -r a362b0b95e42 -r ad489b44ef16 rust/Cargo.lock
--- a/rust/Cargo.lock	Fri Apr 05 14:35:33 2019 +0200
+++ b/rust/Cargo.lock	Tue Apr 16 21:17:51 2019 +0200
@@ -49,8 +49,10 @@
 name = "hg-core"
 version = "0.1.0"
 dependencies = [
+ "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand_pcg 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -74,7 +76,7 @@
 
 [[package]]
 name = "lazy_static"
-version = "1.2.0"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [[package]]
@@ -265,7 +267,7 @@
 version = "0.3.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -310,7 +312,7 @@
 "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
 "checksum cpython 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b489034e723e7f5109fecd19b719e664f89ef925be785885252469e9822fa940"
 "checksum fuchsia-cprng 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "81f7f8eb465745ea9b02e2704612a9946a59fa40572086c6fd49d6ddcf30bf31"
-"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1"
+"checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14"
 "checksum libc 0.2.45 (registry+https://github.com/rust-lang/crates.io-index)" = "2d2857ec59fadc0773853c664d2d18e7198e83883e7060b63c924cb077bd5c74"
 "checksum memchr 2.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "db4c41318937f6e76648f42826b1d9ade5c09cafb5aef7e351240a70f39206e9"
 "checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1"
diff -r a362b0b95e42 -r ad489b44ef16 rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml	Fri Apr 05 14:35:33 2019 +0200
+++ b/rust/hg-core/Cargo.toml	Tue Apr 16 21:17:51 2019 +0200
@@ -10,3 +10,7 @@
 [dev-dependencies]
 rand = "*"
 rand_pcg = "*"
+
+[dependencies]
+lazy_static = "1.3.0"
+regex = "^1.1"
\ No newline at end of file
diff -r a362b0b95e42 -r ad489b44ef16 rust/hg-core/src/filepatterns.rs
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rust/hg-core/src/filepatterns.rs	Tue Apr 16 21:17:51 2019 +0200
@@ -0,0 +1,346 @@
+use crate::{LineNumber, PatternError, PatternFileError};
+use regex::Regex;
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::Read;
+use std::vec::Vec;
+
+lazy_static! {
+    static ref reescape: Vec<Vec<u8>> = {
+        let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
+        let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c";
+        for byte in to_escape {
+            v[*byte as usize].insert(0, b'\\');
+        }
+        v
+    };
+}
+
+const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] =
+    &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")];
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum PatternSyntax {
+    Regexp,
+    RootGlob,
+    // Glob that matches at the front of the path
+    Glob,
+    // Glob that matches at any suffix of the path (still anchored at slashes)
+    Path,
+    RelPath,
+    RelGlob,
+    RelRegexp,
+    RootFiles,
+}
+
+/// Transforms a glob pattern into a regex
+fn glob_to_re(pat: &[u8]) -> Vec<u8> {
+    let mut input = pat;
+    let mut res: Vec<u8> = vec![];
+    let mut group_depth = 0;
+
+    while let Some((c, rest)) = input.split_first() {
+        input = rest;
+
+        match c {
+            b'*' => {
+                for (source, repl) in GLOB_REPLACEMENTS {
+                    if input.starts_with(source) {
+                        input = &input[source.len()..];
+                        res.extend(repl.iter());
+                        break;
+                    }
+                }
+            }
+            b'?' => res.extend(b"."),
+            b'[' => {
+                match input
+                    .iter()
+                    .enumerate()
+                    .position(|(i, b)| *b == b']' && i != 0)
+                {
+                    None => res.extend(b"\\["),
+                    Some(end) => {
+                        res.extend(b"[");
+                        for (i, b) in input[..end].iter().enumerate() {
+                            if *b == b'!' && i == 0 {
+                                res.extend(b"^")
+                            } else if *b == b'^' && i == 0 {
+                                res.extend(b"\\^")
+                            } else if *b == b'\\' {
+                                res.extend(b"\\\\")
+                            } else {
+                                res.push(*b)
+                            }
+                        }
+                        res.extend(b"]");
+                        input = &input[end + 1..];
+                    }
+                }
+            }
+            b'{' => {
+                group_depth += 1;
+                res.extend(b"(?:")
+            }
+            b'}' if group_depth > 0 => {
+                group_depth -= 1;
+                res.extend(b")");
+            }
+            b',' if group_depth > 0 => res.extend(b"|"),
+            b'\\' => {
+                let c = {
+                    if let Some((c, rest)) = input.split_first() {
+                        input = rest;
+                        c
+                    } else {
+                        c
+                    }
+                };
+                res.extend(&reescape[*c as usize])
+            }
+            _ => res.extend(&reescape[*c as usize]),
+        }
+    }
+    res
+}
+
+fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
+    pattern
+        .iter()
+        .flat_map(|c| reescape[*c as usize].clone())
+        .collect()
+}
+
+fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> {
+    match kind {
+        b"re" => Ok(PatternSyntax::Regexp),
+        b"path" => Ok(PatternSyntax::Path),
+        b"relpath" => Ok(PatternSyntax::RelPath),
+        b"rootfilesin" => Ok(PatternSyntax::RootFiles),
+        b"relglob" => Ok(PatternSyntax::RelGlob),
+        b"relre" => Ok(PatternSyntax::RelRegexp),
+        b"glob" => Ok(PatternSyntax::Glob),
+        b"rootglob" => Ok(PatternSyntax::RootGlob),
+        _ => Err(PatternError::UnsupportedSyntax(
+            String::from_utf8_lossy(kind).to_string(),
+        )),
+    }
+}
+
+/// Builds the regex that corresponds to the given pattern.
+/// If within a `syntax: regexp` context, returns the pattern,
+/// otherwise, returns the corresponding regex.
+fn _build_single_regex(
+    syntax: PatternSyntax,
+    pattern: &[u8],
+    globsuffix: &[u8],
+) -> Vec<u8> {
+    if pattern.is_empty() {
+        return vec![];
+    }
+    match syntax {
+        PatternSyntax::Regexp => pattern.to_owned(),
+        PatternSyntax::RelRegexp => {
+            if pattern[0] == b'^' {
+                return pattern.to_owned();
+            }
+            let mut res = b".*".to_vec();
+            res.extend(pattern);
+            res
+        }
+        PatternSyntax::Path | PatternSyntax::RelPath => {
+            if pattern == b"." {
+                return vec![];
+            }
+            let mut pattern = escape_pattern(pattern);
+            pattern.extend(b"(?:/|$)");
+            pattern
+        }
+        PatternSyntax::RootFiles => {
+            let mut res = if pattern == b"." {
+                vec![]
+            } else {
+                // Pattern is a directory name.
+                let mut as_vec: Vec<u8> = escape_pattern(pattern);
+                as_vec.push(b'/');
+                as_vec
+            };
+
+            // Anything after the pattern must be a non-directory.
+            res.extend(b"[^/]+$");
+            res
+        }
+        PatternSyntax::Glob
+        | PatternSyntax::RelGlob
+        | PatternSyntax::RootGlob => {
+            let mut res: Vec<u8> = vec![];
+            if syntax == PatternSyntax::RelGlob {
+                res.extend(b"(?:|.*/)");
+            }
+
+            res.extend(glob_to_re(pattern));
+            res.extend(globsuffix.iter());
+            res
+        }
+    }
+}
+
+const GLOB_SPECIAL_CHARACTERS: [u8; 7] =
+    [b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
+
+/// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
+/// that don't need to be transformed into a regex.
+pub fn build_single_regex(
+    kind: &str,
+    pat: &[u8],
+    globsuffix: &[u8],
+) -> Result<Vec<u8>, PatternError> {
+    let enum_kind = parse_pattern_syntax(kind.as_bytes())?;
+    if enum_kind == PatternSyntax::RootGlob
+        && pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
+    {
+        Ok(pat.to_vec())
+    } else {
+        Ok(_build_single_regex(enum_kind, pat, globsuffix))
+    }
+}
+
+lazy_static! {
+    static ref SYNTAXES: HashMap<&'static str, &'static str> = {
+        let mut m = HashMap::new();
+
+        m.insert("re", "relre:");
+        m.insert("regexp", "relre:");
+        m.insert("glob", "relglob:");
+        m.insert("rootglob", "rootglob:");
+        m.insert("include", "include");
+        m.insert("subinclude", "subinclude");
+        m
+    };
+}
+
+pub type PatternTuple = (String, LineNumber, String);
+type WarningTuple = (String, String);
+
+pub fn parse_pattern_file_contents(
+    lines: &str,
+    file_path: &str,
+    warn: bool,
+) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
+    let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
+    let mut inputs: Vec<PatternTuple> = vec![];
+    let mut warnings: Vec<WarningTuple> = vec![];
+
+    let mut current_syntax = Cow::Borrowed("relre:");
+
+    for (line_number, line) in lines.split('\n').enumerate() {
+        let line_number = line_number + 1;
+        let mut line = line.to_string();
+        let mut syntax = current_syntax.to_string();
+        let mut syntax = syntax.as_ref();
+
+        if line.contains('#') {
+            if let Some(cap) = comment_regex.captures(line.clone().as_ref()) {
+                line = line[..cap.get(1).unwrap().end()].to_string()
+            }
+            line = line.replace(r"\#", "#");
+        }
+        line = str::trim_end(line.as_ref()).to_string();
+
+        if line.is_empty() {
+            continue;
+        }
+
+        if line.starts_with("syntax:") {
+            syntax = str::trim(&line["syntax:".len()..]);
+
+            if let Some(rel_syntax) = SYNTAXES.get(syntax) {
+                current_syntax = Cow::Owned(rel_syntax.to_string());;
+            } else if warn {
+                warnings.push((file_path.to_string(), syntax.to_string()));
+            }
+            continue;
+        }
+
+        let mut line_syntax = syntax;
+        let mut final_line = line.clone();
+
+        for (s, rels) in SYNTAXES.iter() {
+            if final_line.starts_with(rels) {
+                line_syntax = rels;
+                final_line = line[rels.len()..].to_string();
+                break;
+            } else if final_line.starts_with(&format!("{}:", s)) {
+                line_syntax = rels;
+                final_line = line[s.len() + 1..].to_string();
+                break;
+            }
+        }
+
+        inputs.push((
+            line_syntax.to_string() + &final_line,
+            line_number,
+            final_line,
+        ));
+        current_syntax = Cow::Owned(syntax.to_string());
+    }
+    (inputs, warnings)
+}
+
+pub fn read_pattern_file(
+    file_path: String,
+    warn: bool,
+) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
+    let mut f = File::open(&file_path)?;
+    let mut contents = String::new();
+    f.read_to_string(&mut contents).map(|_| {
+        Ok(parse_pattern_file_contents(&contents, &file_path, warn))
+    })?
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn escape_pattern_test() {
+        let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
+        assert_eq!(escape_pattern(untouched), untouched.to_vec());
+        // All escape codes
+        assert_eq!(
+            escape_pattern(br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#),
+            br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"#
+                .to_vec()
+        );
+    }
+
+    #[test]
+    fn glob_test() {
+        assert_eq!(glob_to_re(br#"?"#), br#"."#);
+        assert_eq!(glob_to_re(br#"*"#), br#"[^/]*"#);
+        assert_eq!(glob_to_re(br#"**"#), br#".*"#);
+        assert_eq!(glob_to_re(br#"**/a"#), br#"(?:.*/)?a"#);
+        assert_eq!(glob_to_re(br#"a/**/b"#), br#"a/(?:.*/)?b"#);
+        assert_eq!(glob_to_re(br#"[a*?!^][^b][!c]"#), br#"[a*?!^][\^b][^c]"#);
+        assert_eq!(glob_to_re(br#"{a,b}"#), br#"(?:a|b)"#);
+        assert_eq!(glob_to_re(br#".\*\?"#), br#"\.\*\?"#);
+    }
+
+    #[test]
+    fn test_parse_pattern_file_contents() {
+        let lines = "syntax: glob\n*.elc";
+
+        assert_eq!(
+            vec![("relglob:*.elc".to_string(), 0, "*.elc".to_string())],
+            parse_pattern_file_contents(lines, "file_path", false).0,
+        );
+
+        let lines = "syntax: include\nsyntax: glob";
+
+        assert_eq!(
+            parse_pattern_file_contents(lines, "file_path", false).0,
+            vec![]
+        );
+    }
+}
diff -r a362b0b95e42 -r ad489b44ef16 rust/hg-core/src/lib.rs
--- a/rust/hg-core/src/lib.rs	Fri Apr 05 14:35:33 2019 +0200
+++ b/rust/hg-core/src/lib.rs	Tue Apr 16 21:17:51 2019 +0200
@@ -4,8 +4,18 @@
 // GNU General Public License version 2 or any later version.
 mod ancestors;
 pub mod dagops;
+
 pub use ancestors::{AncestorsIterator, LazyAncestors, MissingAncestors};
 pub mod testing;  // unconditionally built, for use from integration tests
+mod filepatterns;
+
+pub use filepatterns::{
+    build_single_regex, read_pattern_file, PatternSyntax, PatternTuple
+};
+
+#[macro_use]
+extern crate lazy_static;
+extern crate regex;
 
 /// Mercurial revision numbers
 ///
@@ -34,8 +44,27 @@
     fn parents(&self, Revision) -> Result<[Revision; 2], GraphError>;
 }
 
+pub type LineNumber = usize;
+
 #[derive(Clone, Debug, PartialEq)]
 pub enum GraphError {
     ParentOutOfRange(Revision),
     WorkingDirectoryUnsupported,
 }
+
+#[derive(Debug)]
+pub enum PatternError {
+    UnsupportedSyntax(String),
+}
+
+#[derive(Debug)]
+pub enum PatternFileError {
+    IO(std::io::Error),
+    Pattern(PatternError, LineNumber),
+}
+
+impl From<std::io::Error> for PatternFileError {
+    fn from(e: std::io::Error) -> Self {
+        PatternFileError::IO(e)
+    }
+}
\ No newline at end of file


More information about the Mercurial-devel mailing list