speed up relink script

Brendan Cully brendan at kublai.com
Fri Mar 23 14:04:14 CDT 2007


On Friday, 23 March 2007 at 13:47, TK Soh wrote:
> On 3/20/07, Matt Mackall <mpm at selenic.com> wrote:
> >On Tue, Mar 20, 2007 at 06:11:26AM -0500, TK Soh wrote:
> >> Coming to think about it again, perhaps reading from back to front
> >> isn't going to help much either. Apart from the fact that it will be
> >> slow to read that way as pointed out by Bryan, most files in the repos
> >> are likely to stay unchanged over time. So it may actually slow down
> >> the comparison.
> >>
> >> I wonder if we can somehow compare the latest chunk of the index or
> >> data files checked in. Or, perhaps the last rev data in the index file
> >> will be representative? I'm not too confident in my understanding of
> >> the inner workings of hg to decide. Any input?
> >
> >Some notes:
> >
> >Two revlogs are identical if their indices are the same
> >Two revlogs don't match if they have different numbers of entries
> >Two revlogs are identical if their heads are the same.
> >Two revlogs may still be identical if their sizes are different, if
> >their last records are different, etc.
> >
> >The first observation says we can avoid ever reading .d files.
> >
> >This suggests the following approach:
> >
> >For each .i file in repo A:
> >   record size, MD5 hash, number of entries, and sorted list of heads
> >
> >For each .i file in repo B:
> >   if sizes match:
> >     if hashes match:
> >       relink files
> >   else:
> >     read index
> >     if counts don't match:
> >       continue
> >     find heads
> >     if heads match:
> >       relink files
> 
> Attached is the patch where I rewrote the script using the algo given
> by mpm above. The benchmarks are:
> 
>    Crew rev:
>    Relinked 1350 files (361915752 bytes reclaimed)
>    7.45u 9.39s 0:38.60 43.6%
> 
>    patched rev:
>    Relinked 1350 files (361915752 bytes reclaimed)
>    4.69u 2.10s 0:13.23 51.3%
> 
> Please help review the patch. The only thing I don't like that much
> is the repeating os.stat() calls, but I was trying to keep the original
> layout of the code. Thanks.

It's better if collect skips .d files, and you don't do any reading at
all (eg the md5) until after the prune phase. I doubt the md5 is
needed - directly comparing the indexes is probably enough. I don't
think the head-comparing stuff is about speed - it's about possibly
relinking even more files.

> # HG changeset patch
> # User TK Soh <teekaysoh at yahoo.com>
> # Date 1174674962 18000
> # Node ID eef14cac989d39f759ca5bfb9717f23f886508eb
> # Parent  fe0fe0b4d73b32f9197f386140f2eb9363a13f7f
> optimize relink script with mpm's algorithm
> 
> diff -r fe0fe0b4d73b -r eef14cac989d contrib/hg-relink
> +++ b/contrib/hg-relink	Fri Mar 23 13:36:02 2007 -0500
> @@ -4,8 +4,40 @@
>  #
>  # This software may be used and distributed according to the terms
>  # of the GNU General Public License, incorporated herein by reference.
> +#
> +# The script uses the following algorithm to check for files to
> +# to relinked, as suggested by Matt Mackall:
> +# 
> +#   Two revlogs are identical if their indices are the same
> +#   Two revlogs don't match if they have different numbers of entries
> +#   Two revlogs are identical if their heads are the same.
> +#   Two revlogs may still be identical if their sizes are different, if
> +#   their last records are different, etc.
> +#   
> +#   The first observations says we can avoid ever reading .d files.
> +#   
> +#   This suggest the following approach:
> +#   
> +#   For each .i file in repo A:
> +#     record size, MD5 hash, number of entries, and sorted list of heads
> +#   
> +#   For each .i file in repo B:
> +#     if sizes match:
> +#       if hashes match:
> +#         relink files
> +#     else:
> +#       read index
> +#       if counts don't match:
> +#         continue
> +#       find heads
> +#       if heads match:
> +#         relink files
> +#
>  
> -import os, sys
> +import os, sys, md5
> +from mercurial import revlog, util
> +
> +CHUNKLEN = 65536
>  
>  class ConfigError(Exception): pass
>  
> @@ -30,22 +62,35 @@ except ConfigError, inst:
>      usage()
>      sys.exit(1)
>  
> +def md5hash(filename):
> +    md = md5.new()
> +    sfp = file(filename)
> +    sin = sfp.read(CHUNKLEN)
> +    while sin:
> +        md.update(sin)
> +        sin = sfp.read(CHUNKLEN)
> +    return md.hexdigest()
> +
>  def collect(src):
>      seplen = len(os.path.sep)
>      candidates = []
>      for dirpath, dirnames, filenames in os.walk(src):
>          relpath = dirpath[len(src) + seplen:]
>          for filename in filenames:
> -            if not (filename.endswith('.i') or filename.endswith('.d')):
> +            if not filename.endswith('.i'):
>                  continue
> +            mhash = md5hash(os.path.join(dirpath, filename))
> +            r = revlog.revlog(util.opener(os.getcwd(), 
> +                    audit=False), filename, "", 0)
>              st = os.stat(os.path.join(dirpath, filename))
> -            candidates.append((os.path.join(relpath, filename), st))
> +            candidates.append((os.path.join(relpath, filename), st,
> +                    mhash, r.count(), r.heads()))
>  
>      return candidates
>  
>  def prune(candidates, dst):
>      targets = []
> -    for fn, st in candidates:
> +    for fn, st, mh, ct, hd in candidates:
>          tgt = os.path.join(dst, fn)
>          try:
>              ts = os.stat(tgt)
> @@ -56,43 +101,50 @@ def prune(candidates, dst):
>              continue
>          if st.st_dev != ts.st_dev:
>              raise Exception('Source and destination are on different devices')
> -        if st.st_size != ts.st_size:
> -            continue
> -        targets.append((fn, ts.st_size))
> +
> +        if st.st_size == ts.st_size:
> +            mhash = md5hash(tgt)
> +            if mhash == mh:
> +                targets.append(fn)
> +        else:
> +            r = revlog.revlog(util.opener(os.getcwd(), 
> +                    audit=False), tgt, "", 0)
> +            if ct != r.count():
> +                continue
> +            heads = r.heads()
> +            heads.sort()
> +            hd.sort()
> +            if heads == hd:
> +                targets.append(fn)
>  
>      return targets
>  
>  def relink(src, dst, files):
> -    CHUNKLEN = 65536
>      relinked = 0
>      savedbytes = 0
>  
> -    for f, sz in files:
> -        source = os.path.join(src, f)
> -        tgt = os.path.join(dst, f)
> -        sfp = file(source)
> -        dfp = file(tgt)
> -        sin = sfp.read(CHUNKLEN)
> -        while sin:
> -            din = dfp.read(CHUNKLEN)
> -            if sin != din:
> -                break
> -            sin = sfp.read(CHUNKLEN)
> -        if sin:
> -            continue
> -        try:
> -            os.rename(tgt, tgt + '.bak')
> +    for ifile in files:
> +        dfile = ifile[:-1] + "d"
> +        for f in (ifile, dfile):
> +            source = os.path.join(src, f)
> +            tgt = os.path.join(dst, f)
> +            if not (os.access(tgt, os.F_OK) and os.access(source, os.F_OK)):
> +                continue
> +                
>              try:
> -                os.link(source, tgt)
> -            except OSError:
> -                os.rename(tgt + '.bak', tgt)
> -                raise
> -            print 'Relinked %s' % f
> -            relinked += 1
> -            savedbytes += sz
> -            os.remove(tgt + '.bak')
> -        except OSError, inst:
> -            print '%s: %s' % (tgt, str(inst))
> +                ts = os.stat(tgt)
> +                os.rename(tgt, tgt + '.bak')
> +                try:
> +                    os.link(source, tgt)
> +                except OSError:
> +                    os.rename(tgt + '.bak', tgt)
> +                    raise
> +                print 'Relinked %s' % f
> +                relinked += 1
> +                savedbytes += ts.st_size
> +                os.remove(tgt + '.bak')
> +            except OSError, inst:
> +                print '%s: %s' % (tgt, str(inst))
>  
>      print 'Relinked %d files (%d bytes reclaimed)' % (relinked, savedbytes)
>  

> _______________________________________________
> Mercurial mailing list
> Mercurial at selenic.com
> http://selenic.com/mailman/listinfo/mercurial



More information about the Mercurial mailing list