[PATCH 3 of 4] contrib/synthrepo: generate initial repo contents using directory shape model

Sat Sep 13 11:55:43 CDT 2014

# HG changeset patch
# User Mike Edgar <adgar at google.com>
# Date 1410573869 14400
#      Fri Sep 12 22:04:29 2014 -0400
# Node ID d504ccbc6ea5e84d1eb99b8a328309974dff80ce
# Parent  5edfc490cd4067d0b7b0e5d0ba878d51a3092aae
contrib/synthrepo: generate initial repo contents using directory shape model

Augments the synthesize command to use an additional parameter of the analyzed
repo model: the number of files in each directory at a given snapshot. Before
synthesizing history, a user-specified number of files (--initfiles) is
generated in a distribution matching the analyzed directory structure.

Intended for developing, testing and measuring scaling improvements when
importing/converting a large repository to Mercurial.

diff -r 5edfc490cd40 -r d504ccbc6ea5 contrib/synthrepo.py

--- a/contrib/synthrepo.py	Fri Sep 12 21:38:52 2014 -0400
+++ b/contrib/synthrepo.py	Fri Sep 12 22:04:29 2014 -0400
@@ -35,10 +35,10 @@
 - Symlinks and binary files are ignored
 '''
 
-import bisect, collections, json, os, random, time, sys
+import bisect, collections, itertools, json, os, random, time, sys
 from mercurial import cmdutil, context, patch, scmutil, util, hg
 from mercurial.i18n import _
-from mercurial.node import nullrev, nullid
+from mercurial.node import nullrev, nullid, short, hex
 
 testedwith = 'internal'
 
@@ -208,14 +208,17 @@
 
 @command('synthesize',
          [('c', 'count', 0, _('create given number of commits'), _('COUNT')),
-          ('', 'dict', '', _('path to a dictionary of words'), _('FILE'))],
+          ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
+          ('', 'initfiles', 0, _('initial file count to create'), _('COUNT'))],
          _('hg synthesize [OPTION].. DESCFILE'))
 def synthesize(ui, repo, descpath, **opts):
     '''synthesize commits based on a model of an existing repository
 
     The model must have been generated by :hg:`analyze`. Commits will
     be generated randomly according to the probabilities described in
-    the model.
+    the model. If --initfiles is set, the repository will be seeded with
+    the given number of files following the modeled repository's directory
+    structure.
 
     When synthesizing new content, commit descriptions, and user
     names, words will be chosen randomly from a dictionary that is
@@ -261,9 +264,19 @@
     words = fp.read().splitlines()
     fp.close()
 
+    initdirs = {}
+    if desc['initdirs']:
+        for k, v in desc['initdirs']:
+            initdirs[k.encode('utf-8').replace('.hg', '_hg')] = v
+        initdirs = renamedirs(initdirs, words)
+    initdirscdf = cdf(initdirs)
+
     def pick(cdf):
         return cdf[0][bisect.bisect_left(cdf[1], random.random())]
 
+    def pickpath():
+        return os.path.join(pick(initdirscdf), random.choice(words))
+
     def makeline(minimum=0):
         total = max(minimum, pick(linelengths))
         c, l = 0, []
@@ -280,8 +293,38 @@
 
     progress = ui.progress
     _synthesizing = _('synthesizing')
+    _files = _('initial files')
     _changesets = _('changesets')
 
+    # Synthesize a single initial revision adding files to the repo according
+    # to the modeled directory structure.
+    initcount = int(opts['initfiles'])
+    if initcount and initdirs:
+        pctx = repo[None].parents()[0]
+        files = {}
+        for i in xrange(0, initcount):
+            ui.progress(_synthesizing, i, unit=_files, total=initcount)
+
+            path = pickpath()
+            while path in pctx.dirs():
+                path = pickpath()
+            data = '%s contents\n' % path
+            files[path] = context.memfilectx(repo, path, data)
+
+        def filectxfn(repo, memctx, path):
+            return files[path]
+
+        ui.progress(_synthesizing, None)
+        message = 'synthesized wide repo with %d files' % (len(files),)
+        mc = context.memctx(repo, [pctx.node(), nullid], message,
+                            files.iterkeys(), filectxfn, ui.username(),
+                            '%d %d' % util.makedate())
+        initnode = mc.commit()
+        hexfn = ui.debugflag and hex or short
+        ui.status(_('added commit %s with %d files\n')
+                  % (hexfn(initnode), len(files)))
+
+    # Synthesize incremental revisions to the repository, adding repo depth.
     count = int(opts['count'])
     heads = set(map(repo.changelog.rev, repo.heads()))
     for i in xrange(count):
@@ -377,3 +420,26 @@
 
     lock.release()
     wlock.release()
+
+def renamedirs(dirs, words):
+    '''Rename each directory in the per-dir file count dict using words.
+
+    Returns [renamed path, count] pairs; names cycle through words in order.'''
+    wordgen = itertools.cycle(words)
+    replacements = {'': ''}
+    def rename(dirpath):
+        '''Return the new name for dirpath, renaming parent prefixes first.
+
+        Prefixes are memoized so a shared parent gets exactly one new name.'''
+        if dirpath in replacements:
+            return replacements[dirpath]
+        head, _ = os.path.split(dirpath)
+        head = head and rename(head) or ''
+        renamed = os.path.join(head, wordgen.next())
+        replacements[dirpath] = renamed
+        return renamed
+    result = []
+    for dirpath, count in dirs.iteritems():
+        # strip leading os.sep: rooted paths never split down to ''
+        result.append([rename(dirpath.lstrip(os.sep)), count])
+    return result