D7866: rust-pathauditor: add Rust implementation of the `pathauditor`

Alphare (Raphaël Gomès) phabricator at mercurial-scm.org
Thu Jan 16 05:23:19 EST 2020


Alphare updated this revision to Diff 19351.

REPOSITORY
  rHG Mercurial

CHANGES SINCE LAST UPDATE
  https://phab.mercurial-scm.org/D7866?vs=19317&id=19351

BRANCH
  default

CHANGES SINCE LAST ACTION
  https://phab.mercurial-scm.org/D7866/new/

REVISION DETAIL
  https://phab.mercurial-scm.org/D7866

AFFECTED FILES
  rust/Cargo.lock
  rust/hg-core/Cargo.toml
  rust/hg-core/src/utils.rs
  rust/hg-core/src/utils/files.rs
  rust/hg-core/src/utils/hg_path.rs
  rust/hg-core/src/utils/path_auditor.rs

CHANGE DETAILS

diff --git a/rust/hg-core/src/utils/path_auditor.rs b/rust/hg-core/src/utils/path_auditor.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/utils/path_auditor.rs
@@ -0,0 +1,235 @@
+// path_auditor.rs
+//
+// Copyright 2020
+// Raphaël Gomès <rgomes at octobus.net>,
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use crate::utils::{
+    files::lower_clean,
+    find_slice_in_slice,
+    hg_path::{hg_path_to_path_buf, HgPath, HgPathBuf, HgPathError},
+};
+use std::collections::HashSet;
+use std::path::{Path, PathBuf};
+
+/// Ensures that a path is valid for use in the repository i.e. does not use
+/// any banned components, does not traverse a symlink, etc.
+#[derive(Debug, Default)]
+pub struct PathAuditor {
+    audited: HashSet<HgPathBuf>,
+    audited_dirs: HashSet<HgPathBuf>,
+    root: PathBuf,
+}
+
+impl PathAuditor {
+    pub fn new(root: impl AsRef<Path>) -> Self {
+        Self {
+            root: root.as_ref().to_owned(),
+            ..Default::default()
+        }
+    }
+    pub fn audit_path(
+        &mut self,
+        path: impl AsRef<HgPath>,
+    ) -> Result<(), HgPathError> {
+        // TODO windows "localpath" normalization
+        let path = path.as_ref();
+        if path.is_empty() {
+            return Ok(());
+        }
+        // TODO case normalization
+        if self.audited.contains(path) {
+            return Ok(());
+        }
+        // AIX ignores "/" at end of path, others raise EISDIR.
+        let last_byte = path.as_bytes()[path.len() - 1];
+        if last_byte == b'/' || last_byte == b'\\' {
+            return Err(HgPathError::EndsWithSlash(path.to_owned()));
+        }
+        let parts: Vec<_> = path
+            .as_bytes()
+            .split(|b| std::path::is_separator(*b as char))
+            .collect();
+        if !path.split_drive().0.is_empty()
+            || [&b".hg"[..], &b".hg."[..], &b""[..]]
+                .contains(&lower_clean(parts[0]).as_ref())
+            || parts.iter().any(|c| c == b"..")
+        {
+            return Err(HgPathError::ContainsIllegalComponent(
+                path.to_owned(),
+            ));
+        }
+
+        // Windows shortname aliases
+        for part in parts.iter() {
+            if part.contains(&b'~') {
+                let mut split = part.splitn(1, |b| *b == b'~');
+                let mut first = split.next().unwrap().to_owned();
+                first.make_ascii_uppercase();
+                let last = split.next().unwrap();
+                if last.iter().all(|b| b.is_ascii_digit())
+                    && [&b"HG"[..], &b"HG8B6C"[..]].contains(&first.as_ref())
+                {
+                    return Err(HgPathError::ContainsIllegalComponent(
+                        path.to_owned(),
+                    ));
+                }
+            }
+        }
+        if find_slice_in_slice(&lower_clean(path.as_bytes()), b".hg").is_some()
+        {
+            let lower_parts: Vec<_> =
+                parts.iter().map(|p| lower_clean(p)).collect();
+            for pattern in [b".hg".to_vec(), b".hg.".to_vec()].iter() {
+                if lower_parts[1..].contains(pattern) {
+                    let pos = lower_parts
+                        .iter()
+                        .position(|part| part == pattern)
+                        .unwrap();
+                    let base = lower_parts[..pos]
+                        .iter()
+                        .fold(HgPathBuf::new(), |acc, p| {
+                            acc.join(HgPath::new(p))
+                        });
+                    return Err(HgPathError::IsInsideNestedRepo {
+                        path: path.to_owned(),
+                        nested_repo: base,
+                    });
+                }
+            }
+        }
+
+        let parts = &parts[..parts.len().saturating_sub(1)];
+
+        let mut prefixes = vec![];
+
+        // It's important that we check the path parts starting from the root.
+        // This means we won't accidentally traverse a symlink into some other
+        // filesystem (which is potentially expensive to access).
+        for index in 0..parts.len() {
+            let prefix =
+                &parts[..index + 1].join(&(std::path::MAIN_SEPARATOR as u8));
+            let prefix = HgPath::new(prefix);
+            if self.audited_dirs.contains(prefix) {
+                continue;
+            }
+            self.check_filesystem(&prefix, &path)?;
+            prefixes.push(prefix.to_owned());
+        }
+
+        self.audited.insert(path.to_owned());
+        // Only add prefixes to the cache after checking everything: we don't
+        // want to add "foo/bar/baz" before checking if there's a "foo/.hg"
+        self.audited_dirs.extend(prefixes);
+
+        Ok(())
+    }
+
+    pub fn check_filesystem(
+        &self,
+        prefix: impl AsRef<HgPath>,
+        path: impl AsRef<HgPath>,
+    ) -> Result<(), HgPathError> {
+        let prefix = prefix.as_ref();
+        let path = path.as_ref();
+        let current_path = self.root.join(
+            hg_path_to_path_buf(prefix)
+                .map_err(|_| HgPathError::NotFsCompliant(path.to_owned()))?,
+        );
+        match std::fs::symlink_metadata(&current_path) {
+            Err(e) => {
+                // EINVAL can be raised as invalid path syntax under win32.
+                // They must be ignored for patterns can be checked too.
+                if e.kind() != std::io::ErrorKind::NotFound
+                    && e.kind() != std::io::ErrorKind::InvalidInput
+                    && e.raw_os_error() != Some(20)
+                {
+                    eprintln!("{:?}", e.kind());
+                    // Rust does not yet have an `ErrorKind` for
+                    // `NotADirectory` (errno 20)
+                    // It happens if the dirstate contains `foo/bar` and
+                    // foo is not a directory
+                    return Err(HgPathError::NotFsCompliant(path.to_owned()));
+                }
+            }
+            Ok(meta) => {
+                if meta.file_type().is_symlink() {
+                    return Err(HgPathError::TraversesSymbolicLink {
+                        path: path.to_owned(),
+                        symlink: prefix.to_owned(),
+                    });
+                }
+                if meta.file_type().is_dir()
+                    && current_path.join(".hg").is_dir()
+                {
+                    return Err(HgPathError::IsInsideNestedRepo {
+                        path: path.to_owned(),
+                        nested_repo: prefix.to_owned(),
+                    });
+                }
+            }
+        };
+
+        Ok(())
+    }
+
+    pub fn check(&mut self, path: impl AsRef<HgPath>) -> bool {
+        self.audit_path(path).is_ok()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::utils::files::get_path_from_bytes;
+    use crate::utils::hg_path::path_to_hg_path_buf;
+
+    #[test]
+    fn test_path_auditor() {
+        let mut auditor = PathAuditor::new(get_path_from_bytes(b"/tmp"));
+
+        let path = HgPath::new(b".hg/00changelog.i");
+        assert_eq!(
+            auditor.audit_path(path),
+            Err(HgPathError::ContainsIllegalComponent(path.to_owned()))
+        );
+        let path = HgPath::new(b"this/is/nested/.hg/thing.txt");
+        assert_eq!(
+            auditor.audit_path(path),
+            Err(HgPathError::IsInsideNestedRepo {
+                path: path.to_owned(),
+                nested_repo: HgPathBuf::from_bytes(b"this/is/nested")
+            })
+        );
+
+        use std::fs::{create_dir, File};
+        use tempfile::tempdir;
+
+        let base_dir = tempdir().unwrap();
+        let base_dir_path = base_dir.path();
+        let a = base_dir_path.join("a");
+        let b = base_dir_path.join("b");
+        create_dir(&a).unwrap();
+        let in_a_path = a.join("in_a");
+        File::create(in_a_path).unwrap();
+
+        // TODO make portable
+        std::os::unix::fs::symlink(&a, &b).unwrap();
+
+        let buf = b.join("in_a").components().skip(2).collect::<PathBuf>();
+        eprintln!("buf: {}", buf.display());
+        let path = path_to_hg_path_buf(buf).unwrap();
+        assert_eq!(
+            auditor.audit_path(&path),
+            Err(HgPathError::TraversesSymbolicLink {
+                path: path,
+                symlink: path_to_hg_path_buf(
+                    b.components().skip(2).collect::<PathBuf>()
+                )
+                .unwrap()
+            })
+        );
+    }
+}
diff --git a/rust/hg-core/src/utils/hg_path.rs b/rust/hg-core/src/utils/hg_path.rs
--- a/rust/hg-core/src/utils/hg_path.rs
+++ b/rust/hg-core/src/utils/hg_path.rs
@@ -15,12 +15,33 @@
 pub enum HgPathError {
     /// Bytes from the invalid `HgPath`
     LeadingSlash(Vec<u8>),
-    /// Bytes and index of the second slash
-    ConsecutiveSlashes(Vec<u8>, usize),
-    /// Bytes and index of the null byte
-    ContainsNullByte(Vec<u8>, usize),
+    ConsecutiveSlashes {
+        bytes: Vec<u8>,
+        second_slash_index: usize,
+    },
+    ContainsNullByte {
+        bytes: Vec<u8>,
+        null_byte_index: usize,
+    },
     /// Bytes
     DecodeError(Vec<u8>),
+    /// The rest come from audit errors
+    EndsWithSlash(HgPathBuf),
+    ContainsIllegalComponent(HgPathBuf),
+    IsInsideNestedRepo {
+        path: HgPathBuf,
+        nested_repo: HgPathBuf,
+    },
+    TraversesSymbolicLink {
+        path: HgPathBuf,
+        symlink: HgPathBuf,
+    },
+    NotFsCompliant(HgPathBuf),
+    /// `path` is the smallest invalid path
+    NotUnderRoot {
+        path: PathBuf,
+        root: PathBuf,
+    },
 }
 
 impl ToString for HgPathError {
@@ -29,17 +50,51 @@
             HgPathError::LeadingSlash(bytes) => {
                 format!("Invalid HgPath '{:?}': has a leading slash.", bytes)
             }
-            HgPathError::ConsecutiveSlashes(bytes, pos) => format!(
-                "Invalid HgPath '{:?}': consecutive slahes at pos {}.",
+            HgPathError::ConsecutiveSlashes {
+                bytes,
+                second_slash_index: pos,
+            } => format!(
+                "Invalid HgPath '{:?}': consecutive slashes at pos {}.",
                 bytes, pos
             ),
-            HgPathError::ContainsNullByte(bytes, pos) => format!(
+            HgPathError::ContainsNullByte {
+                bytes,
+                null_byte_index: pos,
+            } => format!(
                 "Invalid HgPath '{:?}': contains null byte at pos {}.",
                 bytes, pos
             ),
             HgPathError::DecodeError(bytes) => {
                 format!("Invalid HgPath '{:?}': could not be decoded.", bytes)
             }
+            HgPathError::EndsWithSlash(path) => {
+                format!("Audit failed for '{}': ends with a slash.", path)
+            }
+            HgPathError::ContainsIllegalComponent(path) => format!(
+                "Audit failed for '{}': contains an illegal component.",
+                path
+            ),
+            HgPathError::IsInsideNestedRepo {
+                path,
+                nested_repo: nested,
+            } => format!(
+                "Audit failed for '{}': is inside a nested repository '{}'.",
+                path, nested
+            ),
+            HgPathError::TraversesSymbolicLink { path, symlink } => format!(
+                "Audit failed for '{}': traverses symbolic link '{}'.",
+                path, symlink
+            ),
+            HgPathError::NotFsCompliant(path) => format!(
+                "Audit failed for '{}': cannot be turned into a \
+                 filesystem path.",
+                path
+            ),
+            HgPathError::NotUnderRoot { path, root } => format!(
+                "Audit failed for '{}': not under root {}.",
+                path.display(),
+                root.display()
+            ),
         }
     }
 }
@@ -227,17 +282,17 @@
         for (index, byte) in bytes.iter().enumerate() {
             match byte {
                 0 => {
-                    return Err(HgPathError::ContainsNullByte(
-                        bytes.to_vec(),
-                        index,
-                    ))
+                    return Err(HgPathError::ContainsNullByte {
+                        bytes: bytes.to_vec(),
+                        null_byte_index: index,
+                    })
                 }
                 b'/' => {
                     if previous_byte.is_some() && previous_byte == Some(b'/') {
-                        return Err(HgPathError::ConsecutiveSlashes(
-                            bytes.to_vec(),
-                            index,
-                        ));
+                        return Err(HgPathError::ConsecutiveSlashes {
+                            bytes: bytes.to_vec(),
+                            second_slash_index: index,
+                        });
                     }
                 }
                 _ => (),
@@ -429,11 +484,17 @@
             HgPath::new(b"/").check_state()
         );
         assert_eq!(
-            Err(HgPathError::ConsecutiveSlashes(b"a/b//c".to_vec(), 4)),
+            Err(HgPathError::ConsecutiveSlashes {
+                bytes: b"a/b//c".to_vec(),
+                second_slash_index: 4
+            }),
             HgPath::new(b"a/b//c").check_state()
         );
         assert_eq!(
-            Err(HgPathError::ContainsNullByte(b"a/b/\0c".to_vec(), 4)),
+            Err(HgPathError::ContainsNullByte {
+                bytes: b"a/b/\0c".to_vec(),
+                null_byte_index: 4
+            }),
             HgPath::new(b"a/b/\0c").check_state()
         );
         // TODO test HgPathError::DecodeError for the Windows implementation.
diff --git a/rust/hg-core/src/utils/files.rs b/rust/hg-core/src/utils/files.rs
--- a/rust/hg-core/src/utils/files.rs
+++ b/rust/hg-core/src/utils/files.rs
@@ -12,6 +12,8 @@
 use crate::utils::hg_path::{HgPath, HgPathBuf};
 use std::iter::FusedIterator;
 
+use crate::utils::replace_slice;
+use lazy_static::lazy_static;
 use std::fs::Metadata;
 use std::path::Path;
 
@@ -85,6 +87,41 @@
     path.to_ascii_lowercase()
 }
 
+lazy_static! {
+    static ref IGNORED_CHARS: Vec<Vec<u8>> = {
+        [
+            0x200c, 0x200d, 0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d,
+            0x202e, 0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
+        ]
+        .iter()
+        .map(|code| {
+            std::char::from_u32(*code)
+                .unwrap()
+                .encode_utf8(&mut [0; 3])
+                .bytes()
+                .collect()
+        })
+        .collect()
+    };
+}
+
+fn hfs_ignore_clean(bytes: &[u8]) -> Vec<u8> {
+    let mut buf = bytes.to_owned();
+    let needs_escaping = bytes.iter().any(|b| *b == b'\xe2' || *b == b'\xef');
+    if needs_escaping {
+        for forbidden in IGNORED_CHARS.iter() {
+            replace_slice(&mut buf, forbidden, &[])
+        }
+        buf
+    } else {
+        buf
+    }
+}
+
+pub fn lower_clean(bytes: &[u8]) -> Vec<u8> {
+    hfs_ignore_clean(&bytes.to_ascii_lowercase())
+}
+
 #[derive(Eq, PartialEq, Ord, PartialOrd, Copy, Clone)]
 pub struct HgMetadata {
     pub st_dev: u64,
diff --git a/rust/hg-core/src/utils.rs b/rust/hg-core/src/utils.rs
--- a/rust/hg-core/src/utils.rs
+++ b/rust/hg-core/src/utils.rs
@@ -9,6 +9,7 @@
 
 pub mod files;
 pub mod hg_path;
+pub mod path_auditor;
 
 /// Useful until rust/issues/56345 is stable
 ///
diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml
+++ b/rust/hg-core/Cargo.toml
@@ -17,3 +17,6 @@
 rayon = "1.2.0"
 regex = "1.1.0"
 twox-hash = "1.5.0"
+
+[dev-dependencies]
+tempfile = "3.1.0"
\ No newline at end of file
diff --git a/rust/Cargo.lock b/rust/Cargo.lock
--- a/rust/Cargo.lock
+++ b/rust/Cargo.lock
@@ -134,6 +134,7 @@
  "rand_pcg 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)",
  "rayon 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)",
+ "tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)",
  "twox-hash 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)",
 ]
 
@@ -385,6 +386,11 @@
 ]
 
 [[package]]
+name = "redox_syscall"
+version = "0.1.56"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+
+[[package]]
 name = "regex"
 version = "1.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -401,6 +407,14 @@
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [[package]]
+name = "remove_dir_all"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
 name = "rustc_version"
 version = "0.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -427,6 +441,19 @@
 source = "registry+https://github.com/rust-lang/crates.io-index"
 
 [[package]]
+name = "tempfile"
+version = "3.1.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+dependencies = [
+ "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)",
+ "libc 0.2.64 (registry+https://github.com/rust-lang/crates.io-index)",
+ "rand 0.7.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)",
+ "remove_dir_all 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
+ "winapi 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
+]
+
+[[package]]
 name = "thread_local"
 version = "0.3.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
@@ -510,12 +537,15 @@
 "checksum rayon 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "83a27732a533a1be0a0035a111fe76db89ad312f6f0347004c220c57f209a123"
 "checksum rayon-core 1.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "98dcf634205083b17d0861252431eb2acbfb698ab7478a2d20de07954f47ec7b"
 "checksum rdrand 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "678054eb77286b51581ba43620cc911abf02758c91f93f479767aed0f90458b2"
+"checksum redox_syscall 0.1.56 (registry+https://github.com/rust-lang/crates.io-index)" = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84"
 "checksum regex 1.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "dc220bd33bdce8f093101afe22a037b8eb0e5af33592e6a9caafff0d4cb81cbd"
 "checksum regex-syntax 0.6.12 (registry+https://github.com/rust-lang/crates.io-index)" = "11a7e20d1cce64ef2fed88b66d347f88bd9babb82845b2b858f3edbf59a4f716"
+"checksum remove_dir_all 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)" = "4a83fa3702a688b9359eccba92d153ac33fd2e8462f9e0e3fdf155239ea7792e"
 "checksum rustc_version 0.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "138e3e0acb6c9fb258b19b67cb8abd63c00679d2851805ea151465464fe9030a"
 "checksum scopeguard 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b42e15e59b18a828bbf5c58ea01debb36b9b096346de35d941dcb89009f24a0d"
 "checksum semver 0.9.0 (registry+https://github.com/rust-lang/crates.io-index)" = "1d7eb9ef2c18661902cc47e535f9bc51b78acd254da71d375c2f6720d9a40403"
 "checksum semver-parser 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "388a1df253eca08550bef6c72392cfe7c30914bf41df5269b68cbd6ff8f570a3"
+"checksum tempfile 3.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7a6e24d9338a0a5be79593e2fa15a648add6138caa803e2d5bc782c371732ca9"
 "checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b"
 "checksum twox-hash 1.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "3bfd5b7557925ce778ff9b9ef90e3ade34c524b5ff10e239c69a42d546d2af56"
 "checksum wasi 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b89c3ce4ce14bdc6fb6beaf9ec7928ca331de5df7e5ea278375642a2f478570d"



To: Alphare, #hg-reviewers, kevincox
Cc: durin42, kevincox, mercurial-devel


More information about the Mercurial-devel mailing list