D6348: rust-dirstate: add rust implementation of `parse_dirstate` and `pack_dirstate`
Alphare (Raphaël Gomès)
phabricator at mercurial-scm.org
Mon May 6 21:03:51 UTC 2019
Alphare created this revision.
Herald added subscribers: mercurial-devel, kevincox, durin42.
Herald added a reviewer: hg-reviewers.
REVISION SUMMARY
Working towards the goal of having a complete Rust implementation of
`hg status`, these two utils are a first step of many to be taken
to improve performance and code maintainability.
Two dependencies have been added: `memchr` and `byteorder`.
Both of them have been written by reputable community members and are
very mature crates.
The Rust code will often need to use their byte-oriented functions.
A few unit tests have been added and may help future development and debugging.
In a future patch that uses `parse_dirstate` to stat the working tree in
parallel - which neither the Python nor the C implementations do - actual
performance improvements will be seen for larger repositories.
REPOSITORY
rHG Mercurial
REVISION DETAIL
https://phab.mercurial-scm.org/D6348
AFFECTED FILES
rust/hg-core/Cargo.toml
rust/hg-core/src/dirstate.rs
rust/hg-core/src/lib.rs
CHANGE DETAILS
diff --git a/rust/hg-core/src/lib.rs b/rust/hg-core/src/lib.rs
--- a/rust/hg-core/src/lib.rs
+++ b/rust/hg-core/src/lib.rs
@@ -2,6 +2,9 @@
//
// This software may be used and distributed according to the terms of the
// GNU General Public License version 2 or any later version.
+extern crate byteorder;
+extern crate memchr;
+
mod ancestors;
pub mod dagops;
pub use ancestors::{AncestorsIterator, LazyAncestors, MissingAncestors};
@@ -40,3 +43,22 @@
ParentOutOfRange(Revision),
WorkingDirectoryUnsupported,
}
+
+#[derive(Clone, Debug, PartialEq)]
+pub enum DirstateParseError {
+ TooLittleData,
+ Overflow,
+}
+
+#[derive(Debug, PartialEq)]
+pub enum DirstatePackError {
+ CorruptedEntry(String),
+ CorruptedParent,
+ BadSize(usize, usize),
+}
+
+impl From<std::io::Error> for DirstatePackError {
+ fn from(e: std::io::Error) -> Self {
+ DirstatePackError::CorruptedEntry(e.to_string())
+ }
+}
diff --git a/rust/hg-core/src/dirstate.rs b/rust/hg-core/src/dirstate.rs
new file mode 100644
--- /dev/null
+++ b/rust/hg-core/src/dirstate.rs
@@ -0,0 +1,292 @@
+// Copyright 2019 Raphaël Gomès <rgomes at octobus.net>
+//
+// This software may be used and distributed according to the terms of the
+// GNU General Public License version 2 or any later version.
+
+use byteorder::{BigEndian, ByteOrder, WriteBytesExt};
+use std::collections::HashMap;
+use {DirstatePackError, DirstateParseError};
+
+pub type DirStateParents<'a> = ((&'a [u8], &'a [u8]));
+/// The C implementation uses all signed types. This will be an issue
+/// either when 4GB+ source files are commonplace or in 2038, whichever
+/// comes first.
+pub type DirstateTuple = (i8, i32, i32, i32);
+pub type DirstateVec = Vec<(Vec<u8>, DirstateTuple)>;
+pub type CopyMap<'a> = Vec<(&'a [u8], &'a [u8])>;
+
+/// Parents are stored in the dirstate as byte hashes.
+const PARENT_SIZE: usize = 20;
+/// Dirstate entries have a static part of 8 + 32 + 32 + 32 + 32 bits.
+const MIN_ENTRY_SIZE: usize = 17;
+
+pub fn parse_dirstate(
+ contents: &[u8],
+) -> Result<(DirStateParents, DirstateVec, CopyMap), DirstateParseError> {
+ if contents.len() < PARENT_SIZE {
+ return Err(DirstateParseError::TooLittleData);
+ }
+
+ let mut dirstate_vec = vec![];
+ let mut copies = vec![];
+ let parents = (
+ &contents[..PARENT_SIZE],
+ &contents[PARENT_SIZE..PARENT_SIZE * 2],
+ );
+ let mut i = PARENT_SIZE * 2;
+
+ while i < contents.len() {
+ if i + MIN_ENTRY_SIZE > contents.len() {
+ return Err(DirstateParseError::Overflow);
+ }
+
+ let slice = &contents[i..];
+ let state = slice[0] as i8;
+ let mode = BigEndian::read_i32(&slice[1..]);
+ let size = BigEndian::read_i32(&slice[5..]);
+ let mtime = BigEndian::read_i32(&slice[9..]);
+ let path_len = BigEndian::read_i32(&slice[13..]);
+
+ if path_len as usize > contents.len() - i {
+ return Err(DirstateParseError::Overflow);
+ }
+ let path =
+ &slice[MIN_ENTRY_SIZE..MIN_ENTRY_SIZE + (path_len as usize)];
+
+ let (path, copy) = match memchr::memchr(0, &path) {
+ None => (path, None),
+ Some(i) => (&path[..i], Some(&path[(i + 1)..])),
+ };
+
+ if let Some(copy_name) = copy {
+ copies.push((path, copy_name));
+ };
+ dirstate_vec.push((path.to_owned(), (state, mode, size, mtime)));
+ i = i + MIN_ENTRY_SIZE + (path_len as usize);
+ }
+
+ Ok((parents, dirstate_vec, copies))
+}
+
+pub fn pack_dirstate(
+ dirstate_vec: &DirstateVec,
+ copymap: &HashMap<Vec<u8>, Vec<u8>>,
+ parents: DirStateParents,
+ now: i32,
+) -> Result<(Vec<u8>, DirstateVec), DirstatePackError> {
+ if parents.0.len() != PARENT_SIZE || parents.1.len() != PARENT_SIZE {
+ return Err(DirstatePackError::CorruptedParent);
+ }
+
+ let expected_size: usize = dirstate_vec
+ .iter()
+ .map(|(ref filename, _)| {
+ let mut length = MIN_ENTRY_SIZE + filename.len();
+ if let Some(ref copy) = copymap.get(filename) {
+ length += copy.len() + 1;
+ }
+ length
+ })
+ .sum();
+ let expected_size = expected_size + PARENT_SIZE * 2;
+
+ let mut packed = Vec::with_capacity(expected_size);
+ let mut new_dirstate_vec = vec![];
+
+ packed.extend(parents.0);
+ packed.extend(parents.1);
+
+ for (ref filename, (state, mode, size, mtime)) in dirstate_vec {
+ let mut new_filename: Vec<u8> = filename.to_owned();
+ let mut new_mtime: i32 = *mtime;
+ if *state == 'n' as i8 && *mtime == now.into() {
+ // The file was last modified "simultaneously" with the current
+ // write to dirstate (i.e. within the same second for file-
+ // systems with a granularity of 1 sec). This commonly happens
+ // for at least a couple of files on 'update'.
+ // The user could change the file without changing its size
+ // within the same second. Invalidate the file's mtime in
+ // dirstate, forcing future 'status' calls to compare the
+ // contents of the file if the size is the same. This prevents
+ // mistakenly treating such files as clean.
+ new_mtime = -1;
+ new_dirstate_vec.push((
+ filename.to_owned(),
+ (*state, *mode, *size, new_mtime),
+ ));
+ }
+
+ if let Some(copy) = copymap.get(filename) {
+ new_filename.push('\0' as u8);
+ new_filename.extend(copy);
+ }
+
+ packed.write_i8(*state)?;
+ packed.write_i32::<BigEndian>(*mode)?;
+ packed.write_i32::<BigEndian>(*size)?;
+ packed.write_i32::<BigEndian>(new_mtime)?;
+ packed.write_i32::<BigEndian>(new_filename.len() as i32)?;
+ packed.extend(new_filename)
+ }
+
+ if packed.len() != expected_size {
+ return Err(DirstatePackError::BadSize(expected_size, packed.len()));
+ }
+
+ Ok((packed, new_dirstate_vec))
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_pack_dirstate_empty() {
+ let dirstate_vec: DirstateVec = vec![];
+ let copymap = HashMap::new();
+ let parents: DirStateParents =
+ (b"12345678910111213141", b"00000000000000000000");
+ let now: i32 = 15000000;
+ let expected =
+ (b"1234567891011121314100000000000000000000".to_vec(), vec![]);
+
+ assert_eq!(
+ expected,
+ pack_dirstate(&dirstate_vec, ©map, parents, now).unwrap()
+ );
+ }
+ #[test]
+ fn test_pack_dirstate_one_entry() {
+ let dirstate_vec: DirstateVec = vec![(
+ vec!['f' as u8, '1' as u8],
+ ('n' as i8, 0o644, 0, 791231220),
+ )];
+ let copymap = HashMap::new();
+ let parents: DirStateParents =
+ (b"12345678910111213141", b"00000000000000000000");
+ let now: i32 = 15000000;
+ let expected = (
+ [
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 49, 48, 49, 49, 49, 50,
+ 49, 51, 49, 52, 49, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 110, 0, 0, 1, 164, 0,
+ 0, 0, 0, 47, 41, 58, 244, 0, 0, 0, 2, 102, 49,
+ ]
+ .to_vec(),
+ vec![],
+ );
+
+ assert_eq!(
+ expected,
+ pack_dirstate(&dirstate_vec, ©map, parents, now).unwrap()
+ );
+ }
+ #[test]
+ fn test_pack_dirstate_one_entry_with_copy() {
+ let dirstate_vec: DirstateVec =
+ vec![(b"f1".to_vec(), ('n' as i8, 0o644, 0, 791231220))];
+ let mut copymap = HashMap::new();
+ copymap.insert(b"f1".to_vec(), b"copyname".to_vec());
+ let parents: DirStateParents =
+ (b"12345678910111213141", b"00000000000000000000");
+ let now: i32 = 15000000;
+ let expected = (
+ [
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 49, 48, 49, 49, 49, 50,
+ 49, 51, 49, 52, 49, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48,
+ 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 110, 0, 0, 1, 164, 0,
+ 0, 0, 0, 47, 41, 58, 244, 0, 0, 0, 11, 102, 49, 0, 99, 111,
+ 112, 121, 110, 97, 109, 101,
+ ]
+ .to_vec(),
+ vec![],
+ );
+
+ assert_eq!(
+ expected,
+ pack_dirstate(&dirstate_vec, ©map, parents, now).unwrap()
+ );
+ }
+
+ #[test]
+ fn test_parse_pack_one_entry_with_copy() {
+ let dirstate_vec: DirstateVec =
+ vec![(b"f1".to_vec(), ('n' as i8, 0o644, 0, 791231220))];
+ let mut copymap = HashMap::new();
+ copymap.insert(b"f1".to_vec(), b"copyname".to_vec());
+ let parents: DirStateParents =
+ (b"12345678910111213141", b"00000000000000000000");
+ let now: i32 = 15000000;
+ let result =
+ pack_dirstate(&dirstate_vec, ©map, parents, now).unwrap();
+
+ assert_eq!(
+ (
+ parents,
+ dirstate_vec,
+ copymap
+ .iter()
+ .map(|(k, v)| (k.as_slice(), v.as_slice()))
+ .collect()
+ ),
+ parse_dirstate(result.0.as_slice()).unwrap()
+ )
+ }
+
+ #[test]
+ fn test_parse_pack_multiple_entries_with_copy() {
+ let dirstate_vec: DirstateVec = vec![
+ (b"f1".to_vec(), ('n' as i8, 0o644, 123, 791231220)),
+ (b"f2".to_vec(), ('m' as i8, 0o777, 1000, 791231220)),
+ (b"f3".to_vec(), ('r' as i8, 0o644, 234553, 791231220)),
+ (b"f4\xF6".to_vec(), ('a' as i8, 0o644, -1, -1)),
+ ];
+ let mut copymap = HashMap::new();
+ copymap.insert(b"f1".to_vec(), b"copyname".to_vec());
+ copymap.insert(b"f4\xF6".to_vec(), b"copyname2".to_vec());
+ let parents: DirStateParents =
+ (b"12345678910111213141", b"00000000000000000000");
+ let now: i32 = 15000000;
+ let result =
+ pack_dirstate(&dirstate_vec, ©map, parents, now).unwrap();
+
+ assert_eq!(
+ (parents, dirstate_vec, copymap),
+ parse_dirstate(result.0.as_slice())
+ .and_then(|(p, dvec, cvec)| Ok((
+ p,
+ dvec,
+ cvec.iter()
+ .map(|(k, v)| (k.to_vec(), v.to_vec()))
+ .collect()
+ )))
+ .unwrap()
+ )
+ }
+
+ #[test]
+ /// https://www.mercurial-scm.org/repo/hg/rev/af3f26b6bba4
+ fn test_parse_pack_one_entry_with_copy_and_time_conflict() {
+ let dirstate_vec: DirstateVec =
+ vec![(b"f1".to_vec(), ('n' as i8, 0o644, 0, 15000000))];
+ let mut copymap = HashMap::new();
+ copymap.insert(b"f1".to_vec(), b"copyname".to_vec());
+ let parents: DirStateParents =
+ (b"12345678910111213141", b"00000000000000000000");
+ let now: i32 = 15000000;
+ let result =
+ pack_dirstate(&dirstate_vec, ©map, parents, now).unwrap();
+
+ assert_eq!(
+ (
+ parents,
+ vec![(b"f1".to_vec(), ('n' as i8, 0o644, 0, -1))],
+ copymap
+ .iter()
+ .map(|(k, v)| (k.as_slice(), v.as_slice()))
+ .collect()
+ ),
+ parse_dirstate(result.0.as_slice()).unwrap()
+ )
+ }
+}
diff --git a/rust/hg-core/Cargo.toml b/rust/hg-core/Cargo.toml
--- a/rust/hg-core/Cargo.toml
+++ b/rust/hg-core/Cargo.toml
@@ -10,3 +10,7 @@
[dev-dependencies]
rand = "*"
rand_pcg = "*"
+
+[dependencies]
+memchr = "2.2.0"
+byteorder = "1.3.1"
\ No newline at end of file
To: Alphare, #hg-reviewers
Cc: durin42, kevincox, mercurial-devel
More information about the Mercurial-devel
mailing list