D6271: rust-filepatterns: add a Rust implementation of pattern-related utils

Alphare (Raphaël Gomès) phabricator at mercurial-scm.org
Thu Apr 18 12:11:07 UTC 2019


Alphare created this revision.
Herald added subscribers: mercurial-devel, kevincox, durin42.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  This change introduces Rust implementations of two functions related to
  pattern handling, all located in `match.py`:
  
  - `_regex`
  - `readpatternfile`
  
  These utils are useful in the long-term effort to improve `hg status`'s
  performance using Rust. Experimental work done by Valentin Gatien-Baron
  shows very promising improvements, but is too different from the current
  Mercurial core code structure to be used "as-is".
  This is the first - albeit very small - step towards the code revamp
  needed down the line.
  
  Two dependencies were added: `regex` and `lazy_static`. Both of them
  will be useful for a majority of the Rust code that will be written,
  are well known and maintained either by the Rust core team, or by
  very frequent contributors.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D6271

AFFECTED FILES
  rust/Cargo.lock
  rust/hg-core/Cargo.toml
  rust/hg-core/src/filepatterns.rs
  rust/hg-core/src/lib.rs

CHANGE DETAILS

diff --git a/rust/hg-core/src/lib.rs b/rust/hg-core/src/lib.rs
--- a/rust/hg-core/src/lib.rs
+++ b/rust/hg-core/src/lib.rs
@@ -4,8 +4,18 @@
 // GNU General Public License version 2 or any later version.
 mod ancestors;
 pub mod dagops;
+
 pub use ancestors::{AncestorsIterator, LazyAncestors, MissingAncestors};
 pub mod testing;  // unconditionally built, for use from integration tests
+mod filepatterns;
+
+pub use filepatterns::{
+    build_single_regex, read_pattern_file, PatternSyntax, PatternTuple
+};
+
+#[macro_use]
+extern crate lazy_static;
+extern crate regex;
 
 /// Mercurial revision numbers
 ///
@@ -34,8 +44,27 @@
     fn parents(&self, Revision) -> Result<[Revision; 2], GraphError>;
 }
 
+pub type LineNumber = usize;
+
 #[derive(Clone, Debug, PartialEq)]
 pub enum GraphError {
     ParentOutOfRange(Revision),
     WorkingDirectoryUnsupported,
 }
+
+#[derive(Debug)]
+pub enum PatternError {
+    UnsupportedSyntax(String),
+}
+
+#[derive(Debug)]
+pub enum PatternFileError {
+    IO(std::io::Error),
+    Pattern(PatternError, LineNumber),
+}
+
+impl From<std::io::Error> for PatternFileError {
+    fn from(e: std::io::Error) -> Self {
+        PatternFileError::IO(e)
+    }
+}
\ No newline at end of file
diff --git a/rust/hg-core/src/filepatterns.rs b/rust/hg-core/src/filepatterns.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/filepatterns.rs
@@ -0,0 +1,346 @@
+use crate::{LineNumber, PatternError, PatternFileError};
+use regex::Regex;
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::fs::File;
+use std::io::Read;
+use std::vec::Vec;
+
+lazy_static! {
+    static ref reescape: Vec<Vec<u8>> = {
+        let mut v: Vec<Vec<u8>> = (0..=255).map(|byte| vec![byte]).collect();
+        let to_escape = b"()[]{}?*+-|^$\\.&~# \t\n\r\x0b\x0c";
+        for byte in to_escape {
+            v[*byte as usize].insert(0, b'\\');
+        }
+        v
+    };
+}
+
+const GLOB_REPLACEMENTS: &[(&[u8], &[u8])] =
+    &[(b"*/", b"(?:.*/)?"), (b"*", b".*"), (b"", b"[^/]*")];
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub enum PatternSyntax {
+    Regexp,
+    RootGlob,
+    // Glob that matches at the front of the path
+    Glob,
+    // Glob that matches at any suffix of the path (still anchored at slashes)
+    Path,
+    RelPath,
+    RelGlob,
+    RelRegexp,
+    RootFiles,
+}
+
+/// Transforms a glob pattern into a regex
+fn glob_to_re(pat: &[u8]) -> Vec<u8> {
+    let mut input = pat;
+    let mut res: Vec<u8> = vec![];
+    let mut group_depth = 0;
+
+    while let Some((c, rest)) = input.split_first() {
+        input = rest;
+
+        match c {
+            b'*' => {
+                for (source, repl) in GLOB_REPLACEMENTS {
+                    if input.starts_with(source) {
+                        input = &input[source.len()..];
+                        res.extend(repl.iter());
+                        break;
+                    }
+                }
+            }
+            b'?' => res.extend(b"."),
+            b'[' => {
+                match input
+                    .iter()
+                    .enumerate()
+                    .position(|(i, b)| *b == b']' && i != 0)
+                {
+                    None => res.extend(b"\\["),
+                    Some(end) => {
+                        res.extend(b"[");
+                        for (i, b) in input[..end].iter().enumerate() {
+                            if *b == b'!' && i == 0 {
+                                res.extend(b"^")
+                            } else if *b == b'^' && i == 0 {
+                                res.extend(b"\\^")
+                            } else if *b == b'\\' {
+                                res.extend(b"\\\\")
+                            } else {
+                                res.push(*b)
+                            }
+                        }
+                        res.extend(b"]");
+                        input = &input[end + 1..];
+                    }
+                }
+            }
+            b'{' => {
+                group_depth += 1;
+                res.extend(b"(?:")
+            }
+            b'}' if group_depth > 0 => {
+                group_depth -= 1;
+                res.extend(b")");
+            }
+            b',' if group_depth > 0 => res.extend(b"|"),
+            b'\\' => {
+                let c = {
+                    if let Some((c, rest)) = input.split_first() {
+                        input = rest;
+                        c
+                    } else {
+                        c
+                    }
+                };
+                res.extend(&reescape[*c as usize])
+            }
+            _ => res.extend(&reescape[*c as usize]),
+        }
+    }
+    res
+}
+
+fn escape_pattern(pattern: &[u8]) -> Vec<u8> {
+    pattern
+        .iter()
+        .flat_map(|c| reescape[*c as usize].clone())
+        .collect()
+}
+
+fn parse_pattern_syntax(kind: &[u8]) -> Result<PatternSyntax, PatternError> {
+    match kind {
+        b"re" => Ok(PatternSyntax::Regexp),
+        b"path" => Ok(PatternSyntax::Path),
+        b"relpath" => Ok(PatternSyntax::RelPath),
+        b"rootfilesin" => Ok(PatternSyntax::RootFiles),
+        b"relglob" => Ok(PatternSyntax::RelGlob),
+        b"relre" => Ok(PatternSyntax::RelRegexp),
+        b"glob" => Ok(PatternSyntax::Glob),
+        b"rootglob" => Ok(PatternSyntax::RootGlob),
+        _ => Err(PatternError::UnsupportedSyntax(
+            String::from_utf8_lossy(kind).to_string(),
+        )),
+    }
+}
+
+/// Builds the regex that corresponds to the given pattern.
+/// If within a `syntax: regexp` context, returns the pattern,
+/// otherwise, returns the corresponding regex.
+fn _build_single_regex(
+    syntax: PatternSyntax,
+    pattern: &[u8],
+    globsuffix: &[u8],
+) -> Vec<u8> {
+    if pattern.is_empty() {
+        return vec![];
+    }
+    match syntax {
+        PatternSyntax::Regexp => pattern.to_owned(),
+        PatternSyntax::RelRegexp => {
+            if pattern[0] == b'^' {
+                return pattern.to_owned();
+            }
+            let mut res = b".*".to_vec();
+            res.extend(pattern);
+            res
+        }
+        PatternSyntax::Path | PatternSyntax::RelPath => {
+            if pattern == b"." {
+                return vec![];
+            }
+            let mut pattern = escape_pattern(pattern);
+            pattern.extend(b"(?:/|$)");
+            pattern
+        }
+        PatternSyntax::RootFiles => {
+            let mut res = if pattern == b"." {
+                vec![]
+            } else {
+                // Pattern is a directory name.
+                let mut as_vec: Vec<u8> = escape_pattern(pattern);
+                as_vec.push(b'/');
+                as_vec
+            };
+
+            // Anything after the pattern must be a non-directory.
+            res.extend(b"[^/]+$");
+            res
+        }
+        PatternSyntax::Glob
+        | PatternSyntax::RelGlob
+        | PatternSyntax::RootGlob => {
+            let mut res: Vec<u8> = vec![];
+            if syntax == PatternSyntax::RelGlob {
+                res.extend(b"(?:|.*/)");
+            }
+
+            res.extend(glob_to_re(pattern));
+            res.extend(globsuffix.iter());
+            res
+        }
+    }
+}
+
+const GLOB_SPECIAL_CHARACTERS: [u8; 7] =
+    [b'*', b'?', b'[', b']', b'{', b'}', b'\\'];
+
+/// Wrapper function to `_build_single_regex` that short-circuits 'exact' globs
+/// that don't need to be transformed into a regex.
+pub fn build_single_regex(
+    kind: &str,
+    pat: &[u8],
+    globsuffix: &[u8],
+) -> Result<Vec<u8>, PatternError> {
+    let enum_kind = parse_pattern_syntax(kind.as_bytes())?;
+    if enum_kind == PatternSyntax::RootGlob
+        && pat.iter().all(|b| GLOB_SPECIAL_CHARACTERS.contains(b))
+    {
+        Ok(pat.to_vec())
+    } else {
+        Ok(_build_single_regex(enum_kind, pat, globsuffix))
+    }
+}
+
+lazy_static! {
+    static ref SYNTAXES: HashMap<&'static str, &'static str> = {
+        let mut m = HashMap::new();
+
+        m.insert("re", "relre:");
+        m.insert("regexp", "relre:");
+        m.insert("glob", "relglob:");
+        m.insert("rootglob", "rootglob:");
+        m.insert("include", "include");
+        m.insert("subinclude", "subinclude");
+        m
+    };
+}
+
+pub type PatternTuple = (String, LineNumber, String);
+type WarningTuple = (String, String);
+
+pub fn parse_pattern_file_contents(
+    lines: &str,
+    file_path: &str,
+    warn: bool,
+) -> (Vec<PatternTuple>, Vec<WarningTuple>) {
+    let comment_regex = Regex::new(r"((?:^|[^\\])(?:\\\\)*)#.*").unwrap();
+    let mut inputs: Vec<PatternTuple> = vec![];
+    let mut warnings: Vec<WarningTuple> = vec![];
+
+    let mut current_syntax = Cow::Borrowed("relre:");
+
+    for (line_number, line) in lines.split('\n').enumerate() {
+        let line_number = line_number + 1;
+        let mut line = line.to_string();
+        let mut syntax = current_syntax.to_string();
+        let mut syntax = syntax.as_ref();
+
+        if line.contains('#') {
+            if let Some(cap) = comment_regex.captures(line.clone().as_ref()) {
+                line = line[..cap.get(1).unwrap().end()].to_string()
+            }
+            line = line.replace(r"\#", "#");
+        }
+        line = str::trim_end(line.as_ref()).to_string();
+
+        if line.is_empty() {
+            continue;
+        }
+
+        if line.starts_with("syntax:") {
+            syntax = str::trim(&line["syntax:".len()..]);
+
+            if let Some(rel_syntax) = SYNTAXES.get(syntax) {
+                current_syntax = Cow::Owned(rel_syntax.to_string());;
+            } else if warn {
+                warnings.push((file_path.to_string(), syntax.to_string()));
+            }
+            continue;
+        }
+
+        let mut line_syntax = syntax;
+        let mut final_line = line.clone();
+
+        for (s, rels) in SYNTAXES.iter() {
+            if final_line.starts_with(rels) {
+                line_syntax = rels;
+                final_line = line[rels.len()..].to_string();
+                break;
+            } else if final_line.starts_with(&format!("{}:", s)) {
+                line_syntax = rels;
+                final_line = line[s.len() + 1..].to_string();
+                break;
+            }
+        }
+
+        inputs.push((
+            line_syntax.to_string() + &final_line,
+            line_number,
+            final_line,
+        ));
+        current_syntax = Cow::Owned(syntax.to_string());
+    }
+    (inputs, warnings)
+}
+
+pub fn read_pattern_file(
+    file_path: String,
+    warn: bool,
+) -> Result<(Vec<PatternTuple>, Vec<WarningTuple>), PatternFileError> {
+    let mut f = File::open(&file_path)?;
+    let mut contents = String::new();
+    f.read_to_string(&mut contents).map(|_| {
+        Ok(parse_pattern_file_contents(&contents, &file_path, warn))
+    })?
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn escape_pattern_test() {
+        let untouched = br#"!"%',/0123456789:;<=>@ABCDEFGHIJKLMNOPQRSTUVWXYZ_`abcdefghijklmnopqrstuvwxyz"#;
+        assert_eq!(escape_pattern(untouched), untouched.to_vec());
+        // All escape codes
+        assert_eq!(
+            escape_pattern(br#"()[]{}?*+-|^$\\.&~# \t\n\r\v\f"#),
+            br#"\(\)\[\]\{\}\?\*\+\-\|\^\$\\\\\.\&\~\#\ \\t\\n\\r\\v\\f"#
+                .to_vec()
+        );
+    }
+
+    #[test]
+    fn glob_test() {
+        assert_eq!(glob_to_re(br#"?"#), br#"."#);
+        assert_eq!(glob_to_re(br#"*"#), br#"[^/]*"#);
+        assert_eq!(glob_to_re(br#"**"#), br#".*"#);
+        assert_eq!(glob_to_re(br#"**/a"#), br#"(?:.*/)?a"#);
+        assert_eq!(glob_to_re(br#"a/**/b"#), br#"a/(?:.*/)?b"#);
+        assert_eq!(glob_to_re(br#"[a*?!^][^b][!c]"#), br#"[a*?!^][\^b][^c]"#);
+        assert_eq!(glob_to_re(br#"{a,b}"#), br#"(?:a|b)"#);
+        assert_eq!(glob_to_re(br#".\*\?"#), br#"\.\*\?"#);
+    }
+
+    #[test]
+    fn test_parse_pattern_file_contents() {
+        let lines = "syntax: glob\n*.elc";
+
+        assert_eq!(
+            vec![("relglob:*.elc".to_string(), 0, "*.elc".to_string())],
+            parse_pattern_file_contents(lines, "file_path", false).0,
+        );
+
+        let lines = "syntax: include\nsyntax: glob";
+
+        assert_eq!(
+            parse_pattern_file_contents(lines, "file_path", false).0,
+            vec![]
+        );
+    }
+}
diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml
+++ b/rust/hg-core/Cargo.toml
@@ -10,3 +10,7 @@
 [dev-dependencies]
 rand = "*"
 rand_pcg = "*"
+
+[dependencies]
+lazy_static = "1.3.0"
+regex = "^1.1"
\ No newline at end of file
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -49,8 +49,10 @@
 name = "hg-core"
 version = "0.1.0"
 dependencies = [
+ "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)",
  "rand_pcg 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "regex 1.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -74,7 +76,7 @@
 
 [[package]]
 name = "lazy_static"
-version = "1.2.0"
+version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [[package]]
@@ -265,7 +267,7 @@
 version = "0.3.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 dependencies = [
- "lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
+ "lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
 [[package]]
@@ -310,7 +312,7 @@
 "checksum cloudabi 0.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "ddfc5b9aa5d4507acaf872de71051dfd0e309860e88966e1051e462a077aac4f"
 "checksum cpython 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b489034e723e7f5109fecd19b719e664f89ef925be785885252469e9822fa940"
 "checksum fuchsia-cprng 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "81f7f8eb465745ea9b02e2704612a9946a59fa40572086c6fd49d6ddcf30bf31"
-"checksum lazy_static 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a374c89b9db55895453a74c1e38861d9deec0b01b405a82516e9d5de4820dea1"
+"checksum lazy_static 1.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bc5729f27f159ddd61f4df6228e827e86643d4d3e7c32183cb30a1c08f604a14"
 "checksum libc 0.2.45 (registry+https://github.com/rust-lang/crates.io-index)" = "2d2857ec59fadc0773853c664d2d18e7198e83883e7060b63c924cb077bd5c74"
 "checksum memchr 2.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "db4c41318937f6e76648f42826b1d9ade5c09cafb5aef7e351240a70f39206e9"
 "checksum num-traits 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "0b3a5d7cc97d6d30d8b9bc8fa19bf45349ffe46241e8816f50f62f6d6aaabee1"



To: Alphare, #hg-reviewers
Cc: durin42, kevincox, mercurial-devel


More information about the Mercurial-devel mailing list