D6348: rust-dirstate: add rust implementation of `parse_dirstate` and `pack_dirstate`

Alphare (Raphaël Gomès) phabricator at mercurial-scm.org
Mon May 6 17:03:51 EDT 2019


Alphare created this revision.
Herald added subscribers: mercurial-devel, kevincox, durin42.
Herald added a reviewer: hg-reviewers.

REVISION SUMMARY
  Working towards the goal of having a complete Rust implementation of
  `hg status`, these two utils are a first step of many to be taken
  to improve performance and code maintainability.
  
  Two dependencies have been added: `memchr` and `byteorder`.
  Both of them have been written by reputable community members and are
  very mature crates.
  The Rust code will often need to use their byte-oriented functions.
  
  A few unit tests have been added and may help future development and debugging.
  
  In a future patch that uses `parse_dirstate` to stat the working tree in
  parallel - which neither the Python nor the C implementations do - actual
  performance improvements will be seen for larger repositories.

REPOSITORY
  rHG Mercurial

REVISION DETAIL
  https://phab.mercurial-scm.org/D6348

AFFECTED FILES
  rust/hg-core/Cargo.toml
  rust/hg-core/src/dirstate.rs
  rust/hg-core/src/lib.rs

CHANGE DETAILS

diff --git a/rust/hg-core/src/lib.rs b/rust/hg-core/src/lib.rs
--- a/rust/hg-core/src/lib.rs
+++ b/rust/hg-core/src/lib.rs
@@ -2,6 +2,9 @@
 //
 // This software may be used and distributed according to the terms of the
 // GNU General Public License version 2 or any later version.
+extern crate byteorder;
+extern crate memchr;
+
 mod ancestors;
 pub mod dagops;
 pub use ancestors::{AncestorsIterator, LazyAncestors, MissingAncestors};
@@ -40,3 +43,22 @@
     ParentOutOfRange(Revision),
     WorkingDirectoryUnsupported,
 }
+
+#[derive(Clone, Debug, PartialEq)]
+pub enum DirstateParseError {
+    TooLittleData,
+    Overflow,
+}
+
+#[derive(Debug, PartialEq)]
+pub enum DirstatePackError {
+    CorruptedEntry(String),
+    CorruptedParent,
+    BadSize(usize, usize),
+}
+
+impl From<std::io::Error> for DirstatePackError {
+    fn from(e: std::io::Error) -> Self {
+        DirstatePackError::CorruptedEntry(e.to_string())
+    }
+}
diff --git a/rust/hg-core/src/dirstate.rs b/rust/hg-core/src/dirstate.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/dirstate.rs
@@ -0,0 +1,292 @@
+// Copyright 2019 Raphaël Gomès <rgomes at octobus.net>
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use byteorder::{BigEndian, ByteOrder, WriteBytesExt};
+use std::collections::HashMap;
+use {DirstatePackError, DirstateParseError};
+
+pub type DirStateParents<'a> = ((&'a [u8], &'a [u8]));
+/// The C implementation uses all signed types. This will be an issue
+/// either when 4GB+ source files are commonplace or in 2038, whichever
+/// comes first.
+pub type DirstateTuple = (i8, i32, i32, i32);
+pub type DirstateVec = Vec<(Vec<u8>, DirstateTuple)>;
+pub type CopyMap<'a> = Vec<(&'a [u8], &'a [u8])>;
+
+/// Parents are stored in the dirstate as byte hashes.
+const PARENT_SIZE: usize = 20;
+/// Dirstate entries have a static part of 8 + 32 + 32 + 32 + 32 bits.
+const MIN_ENTRY_SIZE: usize = 17;
+
+pub fn parse_dirstate(
+    contents: &[u8],
+) -> Result<(DirStateParents, DirstateVec, CopyMap), DirstateParseError> {
+    if contents.len() < PARENT_SIZE {
+        return Err(DirstateParseError::TooLittleData);
+    }
+
+    let mut dirstate_vec = vec![];
+    let mut copies = vec![];
+    let parents = (
+        &contents[..PARENT_SIZE],
+        &contents[PARENT_SIZE..PARENT_SIZE * 2],
+    );
+    let mut i = PARENT_SIZE * 2;
+
+    while i < contents.len() {
+        if i + MIN_ENTRY_SIZE > contents.len() {
+            return Err(DirstateParseError::Overflow);
+        }
+
+        let slice = &contents[i..];
+        let state = slice[0] as i8;
+        let mode = BigEndian::read_i32(&slice[1..]);
+        let size = BigEndian::read_i32(&slice[5..]);
+        let mtime = BigEndian::read_i32(&slice[9..]);
+        let path_len = BigEndian::read_i32(&slice[13..]);
+
+        if path_len as usize > contents.len() - i {
+            return Err(DirstateParseError::Overflow);
+        }
+        let path =
+            &slice[MIN_ENTRY_SIZE..MIN_ENTRY_SIZE + (path_len as usize)];
+
+        let (path, copy) = match memchr::memchr(0, &path) {
+            None => (path, None),
+            Some(i) => (&path[..i], Some(&path[(i + 1)..])),
+        };
+
+        if let Some(copy_name) = copy {
+            copies.push((path, copy_name));
+        };
+        dirstate_vec.push((path.to_owned(), (state, mode, size, mtime)));
+        i = i + MIN_ENTRY_SIZE + (path_len as usize);
+    }
+
+    Ok((parents, dirstate_vec, copies))
+}
+
+pub fn pack_dirstate(
+    dirstate_vec: &DirstateVec,
+    copymap: &HashMap<Vec<u8>, Vec<u8>>,
+    parents: DirStateParents,
+    now: i32,
+) -> Result<(Vec<u8>, DirstateVec), DirstatePackError> {
+    if parents.0.len() != PARENT_SIZE || parents.1.len() != PARENT_SIZE {
+        return Err(DirstatePackError::CorruptedParent);
+    }
+
+    let expected_size: usize = dirstate_vec
+        .iter()
+        .map(|(ref filename, _)| {
+            let mut length = MIN_ENTRY_SIZE + filename.len();
+            if let Some(ref copy) = copymap.get(filename) {
+                length += copy.len() + 1;
+            }
+            length
+        })
+        .sum();
+    let expected_size = expected_size + PARENT_SIZE * 2;
+
+    let mut packed = Vec::with_capacity(expected_size);
+    let mut new_dirstate_vec = vec![];
+
+    packed.extend(parents.0);
+    packed.extend(parents.1);
+
+    for (ref filename, (state, mode, size, mtime)) in dirstate_vec {
+        let mut new_filename: Vec<u8> = filename.to_owned();
+        let mut new_mtime: i32 = *mtime;
+        if *state == 'n' as i8 && *mtime == now.into() {
+            // The file was last modified "simultaneously" with the current
+            // write to dirstate (i.e. within the same second for file-
+            // systems with a granularity of 1 sec). This commonly happens
+            // for at least a couple of files on 'update'.
+            // The user could change the file without changing its size
+            // within the same second. Invalidate the file's mtime in
+            // dirstate, forcing future 'status' calls to compare the
+            // contents of the file if the size is the same. This prevents
+            // mistakenly treating such files as clean.
+            new_mtime = -1;
+            new_dirstate_vec.push((
+                filename.to_owned(),
+                (*state, *mode, *size, new_mtime),
+            ));
+        }
+
+        if let Some(copy) = copymap.get(filename) {
+            new_filename.push('\0' as u8);
+            new_filename.extend(copy);
+        }
+
+        packed.write_i8(*state)?;
+        packed.write_i32::<BigEndian>(*mode)?;
+        packed.write_i32::<BigEndian>(*size)?;
+        packed.write_i32::<BigEndian>(new_mtime)?;
+        packed.write_i32::<BigEndian>(new_filename.len() as i32)?;
+        packed.extend(new_filename)
+    }
+
+    if packed.len() != expected_size {
+        return Err(DirstatePackError::BadSize(expected_size, packed.len()));
+    }
+
+    Ok((packed, new_dirstate_vec))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_pack_dirstate_empty() {
+        let dirstate_vec: DirstateVec = vec![];
+        let copymap = HashMap::new();
+        let parents: DirStateParents =
+            (b"12345678910111213141", b"00000000000000000000");
+        let now: i32 = 15000000;
+        let expected =
+            (b"1234567891011121314100000000000000000000".to_vec(), vec![]);
+
+        assert_eq!(
+            expected,
+            pack_dirstate(&dirstate_vec, &copymap, parents, now).unwrap()
+        );
+    }
+    #[test]
+    fn test_pack_dirstate_one_entry() {
+        let dirstate_vec: DirstateVec = vec![(
+            vec!['f' as u8, '1' as u8],
+            ('n' as i8, 0o644, 0, 791231220),
+        )];
+        let copymap = HashMap::new();
+        let parents: DirStateParents =
+            (b"12345678910111213141", b"00000000000000000000");
+        let now: i32 = 15000000;
+        let expected = (
+            [
+                49, 50, 51, 52, 53, 54, 55, 56, 57, 49, 48, 49, 49, 49, 50,
+                49, 51, 49, 52, 49, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+                48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 110, 0, 0, 1, 164, 0,
+                0, 0, 0, 47, 41, 58, 244, 0, 0, 0, 2, 102, 49,
+            ]
+            .to_vec(),
+            vec![],
+        );
+
+        assert_eq!(
+            expected,
+            pack_dirstate(&dirstate_vec, &copymap, parents, now).unwrap()
+        );
+    }
+    #[test]
+    fn test_pack_dirstate_one_entry_with_copy() {
+        let dirstate_vec: DirstateVec =
+            vec![(b"f1".to_vec(), ('n' as i8, 0o644, 0, 791231220))];
+        let mut copymap = HashMap::new();
+        copymap.insert(b"f1".to_vec(), b"copyname".to_vec());
+        let parents: DirStateParents =
+            (b"12345678910111213141", b"00000000000000000000");
+        let now: i32 = 15000000;
+        let expected = (
+            [
+                49, 50, 51, 52, 53, 54, 55, 56, 57, 49, 48, 49, 49, 49, 50,
+                49, 51, 49, 52, 49, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+                48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 110, 0, 0, 1, 164, 0,
+                0, 0, 0, 47, 41, 58, 244, 0, 0, 0, 11, 102, 49, 0, 99, 111,
+                112, 121, 110, 97, 109, 101,
+            ]
+            .to_vec(),
+            vec![],
+        );
+
+        assert_eq!(
+            expected,
+            pack_dirstate(&dirstate_vec, &copymap, parents, now).unwrap()
+        );
+    }
+
+    #[test]
+    fn test_parse_pack_one_entry_with_copy() {
+        let dirstate_vec: DirstateVec =
+            vec![(b"f1".to_vec(), ('n' as i8, 0o644, 0, 791231220))];
+        let mut copymap = HashMap::new();
+        copymap.insert(b"f1".to_vec(), b"copyname".to_vec());
+        let parents: DirStateParents =
+            (b"12345678910111213141", b"00000000000000000000");
+        let now: i32 = 15000000;
+        let result =
+            pack_dirstate(&dirstate_vec, &copymap, parents, now).unwrap();
+
+        assert_eq!(
+            (
+                parents,
+                dirstate_vec,
+                copymap
+                    .iter()
+                    .map(|(k, v)| (k.as_slice(), v.as_slice()))
+                    .collect()
+            ),
+            parse_dirstate(result.0.as_slice()).unwrap()
+        )
+    }
+
+    #[test]
+    fn test_parse_pack_multiple_entries_with_copy() {
+        let dirstate_vec: DirstateVec = vec![
+            (b"f1".to_vec(), ('n' as i8, 0o644, 123, 791231220)),
+            (b"f2".to_vec(), ('m' as i8, 0o777, 1000, 791231220)),
+            (b"f3".to_vec(), ('r' as i8, 0o644, 234553, 791231220)),
+            (b"f4\xF6".to_vec(), ('a' as i8, 0o644, -1, -1)),
+        ];
+        let mut copymap = HashMap::new();
+        copymap.insert(b"f1".to_vec(), b"copyname".to_vec());
+        copymap.insert(b"f4\xF6".to_vec(), b"copyname2".to_vec());
+        let parents: DirStateParents =
+            (b"12345678910111213141", b"00000000000000000000");
+        let now: i32 = 15000000;
+        let result =
+            pack_dirstate(&dirstate_vec, &copymap, parents, now).unwrap();
+
+        assert_eq!(
+            (parents, dirstate_vec, copymap),
+            parse_dirstate(result.0.as_slice())
+                .and_then(|(p, dvec, cvec)| Ok((
+                    p,
+                    dvec,
+                    cvec.iter()
+                        .map(|(k, v)| (k.to_vec(), v.to_vec()))
+                        .collect()
+                )))
+                .unwrap()
+        )
+    }
+
+    #[test]
+    /// https://www.mercurial-scm.org/repo/hg/rev/af3f26b6bba4
+    fn test_parse_pack_one_entry_with_copy_and_time_conflict() {
+        let dirstate_vec: DirstateVec =
+            vec![(b"f1".to_vec(), ('n' as i8, 0o644, 0, 15000000))];
+        let mut copymap = HashMap::new();
+        copymap.insert(b"f1".to_vec(), b"copyname".to_vec());
+        let parents: DirStateParents =
+            (b"12345678910111213141", b"00000000000000000000");
+        let now: i32 = 15000000;
+        let result =
+            pack_dirstate(&dirstate_vec, &copymap, parents, now).unwrap();
+
+        assert_eq!(
+            (
+                parents,
+                vec![(b"f1".to_vec(), ('n' as i8, 0o644, 0, -1))],
+                copymap
+                    .iter()
+                    .map(|(k, v)| (k.as_slice(), v.as_slice()))
+                    .collect()
+            ),
+            parse_dirstate(result.0.as_slice()).unwrap()
+        )
+    }
+}
diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml
+++ b/rust/hg-core/Cargo.toml
@@ -10,3 +10,7 @@
 [dev-dependencies]
 rand = "*"
 rand_pcg = "*"
+
+[dependencies]
+memchr = "2.2.0"
+byteorder = "1.3.1"
\ No newline at end of file



To: Alphare, #hg-reviewers
Cc: durin42, kevincox, mercurial-devel


More information about the Mercurial-devel mailing list