D7929: rust-status: add bare `hg status` support in hg-core

Alphare (Raphaël Gomès) phabricator at mercurial-scm.org
Fri Jan 17 16:53:54 UTC 2020


Alphare created this revision.
Herald added subscribers: mercurial-devel, kevincox.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  A lot of performance remains to be gained, most notably by doing more things
  in parallel, but also by caching, not falling back to Python but switching
  to another regex engine, etc..
  
  I have measured on multiple repositories that this change, when in combination
  with the next two patches, improve bare `hg status` performance, and has no
  observable impact when falling back (because it does so early).
  
  On the Netbeans repository:
  C: 840ms
  Rust+C: 556ms
  
  Mozilla Central with the one pattern that causes a fallback removed:
  C: 2.315s
  Rust+C: 1.700 s

REPOSITORY
  rHG Mercurial

BRANCH
  default

REVISION DETAIL
  https://phab.mercurial-scm.org/D7929

AFFECTED FILES
  rust/hg-core/src/dirstate/status.rs
  rust/hg-core/src/lib.rs

CHANGE DETAILS

diff --git a/rust/hg-core/src/lib.rs b/rust/hg-core/src/lib.rs
--- a/rust/hg-core/src/lib.rs
+++ b/rust/hg-core/src/lib.rs
@@ -13,7 +13,9 @@
     dirs_multiset::{DirsMultiset, DirsMultisetIter},
     dirstate_map::DirstateMap,
     parsers::{pack_dirstate, parse_dirstate, PARENT_SIZE},
-    status::{status, DirstateStatus},
+    status::{
+        status, BadMatch, BadType, DirstateStatus, StatusError, StatusOptions,
+    },
     CopyMap, CopyMapIter, DirstateEntry, DirstateParents, EntryState,
     StateMap, StateMapIter,
 };
diff --git a/rust/hg-core/src/dirstate/status.rs b/rust/hg-core/src/dirstate/status.rs
--- a/rust/hg-core/src/dirstate/status.rs
+++ b/rust/hg-core/src/dirstate/status.rs
@@ -11,22 +11,31 @@
 
 use crate::{
     dirstate::SIZE_FROM_OTHER_PARENT,
-    matchers::{Matcher, VisitChildrenSet},
+    filepatterns::PatternFileWarning,
+    matchers::{get_ignore_function, Matcher, VisitChildrenSet},
     utils::{
-        files::HgMetadata,
+        files::{find_dirs, HgMetadata},
         hg_path::{
             hg_path_to_path_buf, os_string_to_hg_path_buf, HgPath, HgPathBuf,
+            HgPathError,
         },
+        path_auditor::PathAuditor,
     },
     CopyMap, DirstateEntry, DirstateMap, EntryState, FastHashMap,
+    PatternError,
 };
+use lazy_static::lazy_static;
 use rayon::prelude::*;
-use std::borrow::Cow;
-use std::collections::{HashSet, VecDeque};
-use std::fs::{read_dir, DirEntry};
-use std::io::ErrorKind;
-use std::ops::Deref;
-use std::path::Path;
+use std::collections::VecDeque;
+use std::{
+    borrow::Cow,
+    collections::HashSet,
+    fs::{read_dir, DirEntry},
+    io::ErrorKind,
+    ops::Deref,
+    path::Path,
+    sync::mpsc,
+};
 
 #[derive(Debug)]
 pub enum BadType {
@@ -47,6 +56,7 @@
 /// Marker enum used to dispatch new status entries into the right collections.
 /// Is similar to `crate::EntryState`, but represents the transient state of
 /// entries during the lifetime of a command.
+#[derive(Debug)]
 enum Dispatch {
     Unsure,
     Modified,
@@ -145,7 +155,7 @@
             } else if options.list_clean {
                 Dispatch::Clean
             } else {
-                Dispatch::Unknown
+                Dispatch::None
             }
         }
         EntryState::Merged => Dispatch::Modified,
@@ -169,57 +179,91 @@
     }
 }
 
+lazy_static! {
+    static ref DEFAULT_WORK: HashSet<&'static HgPath> = {
+        let mut h = HashSet::new();
+        h.insert(HgPath::new(b""));
+        h
+    };
+}
+
 /// Get stat data about the files explicitly specified by match.
 /// TODO subrepos
 fn walk_explicit<'a>(
-    files: &'a HashSet<&HgPath>,
+    files: Option<&'a HashSet<&HgPath>>,
     dmap: &'a DirstateMap,
     root_dir: impl AsRef<Path> + Sync + Send,
+    work: mpsc::Sender<&'a HgPath>,
     options: StatusOptions,
-) -> impl ParallelIterator<Item = IoResult<(&'a HgPath, Dispatch)>> {
-    files.par_iter().filter_map(move |filename| {
-        // TODO normalization
-        let normalized = filename.as_ref();
+) -> impl ParallelIterator<Item = IoResult<(Cow<'a, HgPath>, Dispatch)>> {
+    files
+        .unwrap_or(&DEFAULT_WORK)
+        .par_iter()
+        .map_with(work, move |work, filename| {
+            // TODO normalization
+            let normalized = filename.as_ref();
 
-        let buf = match hg_path_to_path_buf(normalized) {
-            Ok(x) => x,
-            Err(e) => return Some(Err(e.into())),
-        };
-        let target = root_dir.as_ref().join(buf);
-        let st = target.symlink_metadata();
-        match st {
-            Ok(meta) => {
-                let file_type = meta.file_type();
-                if file_type.is_file() || file_type.is_symlink() {
-                    if let Some(entry) = dmap.get(normalized) {
+            let buf = match hg_path_to_path_buf(normalized) {
+                Ok(x) => x,
+                Err(e) => return Some(Err(e.into())),
+            };
+            let target = root_dir.as_ref().join(buf);
+            let st = target.symlink_metadata();
+            let in_dmap = dmap.get(normalized);
+            match st {
+                Ok(meta) => {
+                    let file_type = meta.file_type();
+                    if file_type.is_file() || file_type.is_symlink() {
+                        if let Some(entry) = in_dmap {
+                            return Some(Ok((
+                                Cow::Borrowed(normalized),
+                                dispatch_found(
+                                    &normalized,
+                                    *entry,
+                                    HgMetadata::from_metadata(meta),
+                                    &dmap.copy_map,
+                                    options,
+                                ),
+                            )));
+                        }
                         return Some(Ok((
-                            normalized,
-                            dispatch_found(
-                                &normalized,
-                                *entry,
-                                HgMetadata::from_metadata(meta),
-                                &dmap.copy_map,
-                                options,
-                            ),
+                            Cow::Borrowed(normalized),
+                            Dispatch::Unknown,
+                        )));
+                    } else {
+                        if file_type.is_dir() {
+                            // The channel always outlives the sender, unwrap
+                            work.send(normalized).unwrap()
+                        } else {
+                            return Some(Ok((
+                                Cow::Borrowed(normalized),
+                                Dispatch::Bad(BadMatch::BadType(
+                                    // TODO do more than unknown
+                                    BadType::Unknown,
+                                )),
+                            )));
+                        }
+                        if in_dmap.is_some() {
+                            return Some(Ok((
+                                Cow::Borrowed(normalized),
+                                Dispatch::Removed,
+                            )));
+                        }
+                    }
+                }
+                Err(_) => {
+                    if let Some(entry) = in_dmap {
+                        return Some(Ok((
+                            Cow::Borrowed(normalized),
+                            dispatch_missing(entry.state),
                         )));
                     }
-                } else {
-                    if dmap.contains_key(normalized) {
-                        return Some(Ok((normalized, Dispatch::Removed)));
-                    }
                 }
-            }
-            Err(_) => {
-                if let Some(entry) = dmap.get(normalized) {
-                    return Some(Ok((
-                        normalized,
-                        dispatch_missing(entry.state),
-                    )));
-                }
-            }
-        };
-        None
-    })
+            };
+            None
+        })
+        .filter(|s| s.is_some())
+        .map(|s| s.unwrap())
 }
 
 #[derive(Debug, Copy, Clone)]
@@ -391,13 +435,12 @@
     Ok(new_results)
 }
 
-/// Stat all entries in the `DirstateMap` and mark them for dispatch into
-/// the relevant collections.
+/// Stat all entries in the `DirstateMap` and mark them for dispatch.
 fn stat_dmap_entries(
     dmap: &DirstateMap,
     root_dir: impl AsRef<Path> + Sync + Send,
     options: StatusOptions,
-) -> impl ParallelIterator<Item = IoResult<(&HgPath, Dispatch)>> {
+) -> impl ParallelIterator<Item = IoResult<(Cow<HgPath>, Dispatch)>> {
     dmap.par_iter().map(move |(filename, entry)| {
         let filename: &HgPath = filename;
         let filename_as_path = hg_path_to_path_buf(filename)?;
@@ -408,10 +451,10 @@
                 if !(m.file_type().is_file()
                     || m.file_type().is_symlink()) =>
             {
-                Ok((filename, dispatch_missing(entry.state)))
+                Ok((Cow::Borrowed(filename), dispatch_missing(entry.state)))
             }
             Ok(m) => Ok((
-                filename,
+                Cow::Borrowed(filename),
                 dispatch_found(
                     filename,
                     *entry,
@@ -421,14 +464,14 @@
                 ),
             )),
             Err(ref e)
-                if e.kind() == std::io::ErrorKind::NotFound
+                if e.kind() == ErrorKind::NotFound
                     || e.raw_os_error() == Some(20) =>
             {
                 // Rust does not yet have an `ErrorKind` for
                 // `NotADirectory` (errno 20)
                 // It happens if the dirstate contains `foo/bar` and
                 // foo is not a directory
-                Ok((filename, dispatch_missing(entry.state)))
+                Ok((Cow::Borrowed(filename), dispatch_missing(entry.state)))
             }
             Err(e) => Err(e),
         }
@@ -436,21 +479,19 @@
 }
 
 pub struct DirstateStatus<'a> {
-    pub modified: Vec<&'a HgPath>,
-    pub added: Vec<&'a HgPath>,
-    pub removed: Vec<&'a HgPath>,
-    pub deleted: Vec<&'a HgPath>,
-    pub clean: Vec<&'a HgPath>,
-    pub ignored: Vec<&'a HgPath>,
-    pub unknown: Vec<&'a HgPath>,
-    pub bad: Vec<(&'a HgPath, BadMatch)>,
-    /* TODO ignored
-     * TODO unknown */
+    pub modified: Vec<Cow<'a, HgPath>>,
+    pub added: Vec<Cow<'a, HgPath>>,
+    pub removed: Vec<Cow<'a, HgPath>>,
+    pub deleted: Vec<Cow<'a, HgPath>>,
+    pub clean: Vec<Cow<'a, HgPath>>,
+    pub ignored: Vec<Cow<'a, HgPath>>,
+    pub unknown: Vec<Cow<'a, HgPath>>,
+    pub bad: Vec<(Cow<'a, HgPath>, BadMatch)>,
 }
 
 fn build_response<'a>(
-    results: impl IntoIterator<Item = IoResult<(&'a HgPath, Dispatch)>>,
-) -> IoResult<(Vec<&'a HgPath>, DirstateStatus<'a>)> {
+    results: impl IntoIterator<Item = (Cow<'a, HgPath>, Dispatch)>,
+) -> (Vec<Cow<'a, HgPath>>, DirstateStatus<'a>) {
     let mut lookup = vec![];
     let mut modified = vec![];
     let mut added = vec![];
@@ -461,8 +502,7 @@
     let mut unknown = vec![];
     let mut bad = vec![];
 
-    for res in results.into_iter() {
-        let (filename, dispatch) = res?;
+    for (filename, dispatch) in results.into_iter() {
         match dispatch {
             Dispatch::Unknown => unknown.push(filename),
             Dispatch::Unsure => lookup.push(filename),
@@ -477,7 +517,7 @@
         }
     }
 
-    Ok((
+    (
         lookup,
         DirstateStatus {
             modified,
@@ -489,25 +529,189 @@
             unknown,
             bad,
         },
-    ))
+    )
+}
+
+pub enum StatusError {
+    IO(std::io::Error),
+    Path(HgPathError),
+    Pattern(PatternError),
 }
 
+pub type StatusResult<T> = Result<T, StatusError>;
+
+impl From<PatternError> for StatusError {
+    fn from(e: PatternError) -> Self {
+        StatusError::Pattern(e)
+    }
+}
+impl From<HgPathError> for StatusError {
+    fn from(e: HgPathError) -> Self {
+        StatusError::Path(e)
+    }
+}
+impl From<std::io::Error> for StatusError {
+    fn from(e: std::io::Error) -> Self {
+        StatusError::IO(e)
+    }
+}
+
+impl ToString for StatusError {
+    fn to_string(&self) -> String {
+        match self {
+            StatusError::IO(e) => e.to_string(),
+            StatusError::Path(e) => e.to_string(),
+            StatusError::Pattern(e) => e.to_string(),
+        }
+    }
+}
+
+/// Get the status of files in the working directory.
+///
+/// This is the current entry-point for `hg-core` and is realistically unusable
+/// outside of a Python context because its arguments need to provide a lot of
+/// information that will not be necessary in the future.
 pub fn status<'a: 'c, 'b: 'c, 'c>(
     dmap: &'a DirstateMap,
-    matcher: &'b (impl Matcher),
+    matcher: &'b (impl Matcher + Sync),
     root_dir: impl AsRef<Path> + Sync + Send + Copy,
+    ignore_files: &[impl AsRef<Path>],
     options: StatusOptions,
-) -> IoResult<(Vec<&'c HgPath>, DirstateStatus<'c>)> {
+) -> StatusResult<(
+    (Vec<Cow<'c, HgPath>>, DirstateStatus<'c>),
+    Vec<PatternFileWarning>,
+)> {
+    let (ignore_fn, warnings) = get_ignore_function(&ignore_files, root_dir)?;
+    let dir_ignore_fn = |dir: &_| {
+        if ignore_fn(dir) {
+            true
+        } else {
+            for p in find_dirs(dir) {
+                if ignore_fn(p) {
+                    return true;
+                }
+            }
+            false
+        }
+    };
+
     let files = matcher.file_set();
+
     let mut results = vec![];
-    if let Some(files) = files {
-        results.par_extend(walk_explicit(&files, &dmap, root_dir, options));
+    let mut work = vec![];
+
+    // Step 1: check the files explicitly mentioned by the user
+    let (tx, rx) = mpsc::channel();
+    results.par_extend(walk_explicit(files, &dmap, root_dir, tx, options));
+    while let Ok(dir) = rx.recv() {
+        if options.list_ignored || options.list_unknown && !dir_ignore_fn(dir)
+        {
+            work.push(dir)
+        }
+    }
+
+    let mut results = results.into_iter().flatten().collect();
+
+    // Step 2: recursively check the working directory for changes if needed
+    for dir in work {
+        if dir_ignore_fn(dir) {
+            continue;
+        }
+        results = traverse(
+            matcher, root_dir, &dmap, dir, results, &ignore_fn, options,
+        )?;
     }
 
     if !matcher.is_exact() {
-        let stat_results = stat_dmap_entries(&dmap, root_dir, options);
-        results.par_extend(stat_results);
+        // Step 3: Check the remaining files from the dmap.
+        // If a dmap file is not in results yet, it was either
+        // a) not matched b) ignored, c) missing, or d) under a
+        // symlink directory.
+
+        let to_visit: Box<dyn Iterator<Item = (&HgPathBuf, &DirstateEntry)>> =
+            if results.is_empty() && matcher.matches_everything() {
+                Box::new(dmap.iter())
+            } else {
+                Box::new(dmap.iter().filter_map(|(f, e)| {
+                    if !results.contains_key(f.deref()) && matcher.matches(f) {
+                        Some((f, e))
+                    } else {
+                        None
+                    }
+                }))
+            };
+        let mut to_visit: Vec<_> = to_visit.collect();
+        to_visit.sort_by(|a, b| a.0.cmp(&b.0));
+
+        if options.list_unknown {
+            // We walked all dirs under the roots that weren't ignored, and
+            // everything that matched was stat'ed and is already in results.
+            // The rest must thus be ignored or under a symlink.
+            let mut path_auditor = PathAuditor::new(root_dir);
+
+            for (ref filename, entry) in to_visit {
+                // Report ignored items in the dmap as long as they are not
+                // under a symlink directory.
+                if path_auditor.check(filename) {
+                    // TODO normalize for other case-sensitive fs
+                    let buf = hg_path_to_path_buf(filename)?;
+                    results.insert(
+                        Cow::Borrowed(filename),
+                        match root_dir.as_ref().join(&buf).symlink_metadata() {
+                            // File was just ignored, no links, and exists
+                            Ok(meta) => {
+                                let metadata = HgMetadata::from_metadata(meta);
+                                dispatch_found(
+                                    filename,
+                                    *entry,
+                                    metadata,
+                                    &dmap.copy_map,
+                                    options,
+                                )
+                            }
+                            // File doesn't exist
+                            Err(_) => dispatch_missing(entry.state),
+                        },
+                    );
+                } else {
+                    // It's either missing or under a symlink directory which
+                    // we, in this case, report as missing.
+                    results.insert(
+                        Cow::Borrowed(filename),
+                        dispatch_missing(entry.state),
+                    );
+                }
+            }
+        } else {
+            // We may not have walked the full directory tree above, so stat
+            // and check everything we missed.
+            let stat_results = stat_dmap_entries(&dmap, root_dir, options);
+            results.par_extend(stat_results.flatten());
+        }
     }
 
-    build_response(results)
+    let results = results.into_iter().filter_map(|(filename, dispatch)| {
+        match dispatch {
+            Dispatch::Bad(_) => return Some((filename, dispatch)),
+            _ => {}
+        };
+        // TODO do this in //, not at the end
+        if !dmap.contains_key(filename.deref()) {
+            if (options.list_ignored || matcher.exact_match(&filename))
+                && dir_ignore_fn(&filename)
+            {
+                if options.list_ignored {
+                    return Some((filename.to_owned(), Dispatch::Ignored));
+                }
+            } else {
+                if !ignore_fn(&filename) {
+                    return Some((filename.to_owned(), Dispatch::Unknown));
+                }
+            }
+            return None;
+        }
+        Some((filename, dispatch))
+    });
+
+    Ok((build_response(results), warnings))
 }



To: Alphare, #hg-reviewers
Cc: kevincox, mercurial-devel


More information about the Mercurial-devel mailing list