[PATCH] contrib/synthwiderepo: generate random repos with arbitrarily many files
adgar at google.com
adgar at google.com
Mon Aug 25 16:27:18 CDT 2014
# HG changeset patch
# User Mike Edgar <adgar at google.com>
# Date 1408738363 14400
# Fri Aug 22 16:12:43 2014 -0400
# Node ID 4e11710e8d2fdaf35858a02f310898294c73ef2a
# Parent de783f2403c498ef1e20121acf178b32ec27199c
contrib/synthwiderepo: generate random repos with arbitrarily many files
Two new commands are introduced for the purpose of evaluating manifest
performance at a variety of scaling points.
The first command, analyze-wide, walks a directory tree and emits a model
of the directory structure (currently just file counts per directory).
The second command, synthesize-wide, constructs a single commit with N
files added in a directory structure statistically similar to the
distribution of files in a model created by analyze-wide.
NB: It seems possible to combine this with the existing analyze/
synthesize commands in contrib/synthrepo.py, but the options required
by the -wide variants are different enough that a merger felt kludgy.
diff -r de783f2403c4 -r 4e11710e8d2f contrib/synthwiderepo.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/contrib/synthwiderepo.py Fri Aug 22 16:12:43 2014 -0400
@@ -0,0 +1,184 @@
+# synthwiderepo.py - wide repo synthesis
+#
+# This software may be used and distributed according to the terms of the
+# GNU General Public License version 2 or any later version.
+
+'''synthesize a commit with many files in many directories'''
+
+import bisect, json, itertools, os, random, sys
+from mercurial import cmdutil, context, util, hg
+from mercurial.i18n import _
+from mercurial.node import hex, nullid, short
+
+cmdtable = {}
+command = cmdutil.command(cmdtable)
+
@command('analyze-wide',
         [('o', 'output', '', _('write output to given file'), _('FILE')),
          ('r', 'root', '', _('alternate directory to analyze'), _('DIR'))],
         _('hg analyze-wide'),
         optionalrepo=True)
def analyze_wide(ui, repo, root='', output='', **opts):
    '''measure file count in each directory in a tree and write counts to a file

    Both repositories and arbitrary directories may be analyzed. By default, the
    current repository will be analyzed. To override the default (or if no
    repository is present), specify --root.

    The file counts are written to a JSON file which can be used with
    :hg:`synthesize-wide` to create a commit with synthesized files similar to the
    analyzed directory. Use "--output -" to write the model to standard output.
    '''
    # ui:     used only to report os.walk errors as warnings.
    # repo:   may be None (the command is registered with optionalrepo=True).
    # root:   directory to analyze; defaults to the repository root.
    # output: destination file, '-' for stdout, default '<basename(root)>.json'.
    if repo and not root:
        root = repo.root
    if not root:
        # Without a root, os.walk('') yields nothing and we would silently
        # write an empty model named '.json'.
        raise util.Abort(_('no repository found; specify a directory '
                           'with --root'))

    # Drop trailing separators so that basename() below is non-empty and the
    # prefix stripped from every walked path is exactly the root itself.
    root = root.rstrip(os.sep) or os.sep
    if not output:
        output = os.path.basename(root) + '.json'

    def onerror(e):
        ui.warn('os.walk error: %s\n' % e)

    # Build the whole model before touching the output file, so a failure
    # during the walk does not leave a truncated file behind.
    outjson = {'dirs': {}}
    rootprefixlen = len(root)
    for dirpath, dirnames, filenames in os.walk(root, onerror=onerror):
        # Keys are paths relative to root: '' for the root itself and
        # '<sep>sub<sep>dir' for nested directories.
        outjson['dirs'][dirpath[rootprefixlen:]] = len(filenames)
        # Removing the entry in place stops os.walk from descending into
        # repository metadata.
        if '.hg' in dirnames:
            dirnames.remove('.hg')

    if output == '-':
        # stdout is not ours to close; just make sure the data is flushed.
        json.dump(outjson, sys.stdout)
        sys.stdout.flush()
        return

    fp = open(output, 'w')
    try:
        json.dump(outjson, fp)
    finally:
        fp.close()
+
@command('synthesize-wide',
         [('c', 'count', 0, _('create given number of files'), _('COUNT')),
          ('', 'dict', '', _('path to a dictionary of words'), _('FILE')),
          ('', 'seed', 0, _('seed for random number generator'))],
         _('hg synthesize-wide [OPTION].. DESCFILE'))
def synthesize_wide(ui, repo, descpath, count=0, seed=0, **opts):
    '''synthesize a commit adding files using a model of directory structure

    The model must have been generated by :hg:`analyze-wide`. The directories in
    the model will be renamed randomly, and files will be created in the renamed
    directories according to the probabilities in the model.

    Generated file path components are drawn randomly from a dictionary
    containing one word per line. Use --dict to specify a specific dictionary.
    '''
    # descpath: path or URL of the JSON model (opened through hg.openpath).
    # count:    number of files to add in the single synthesized commit.
    # seed:     optional seed for the random module; 0 leaves it unseeded.
    # opts:     only 'dict' is read (dictionary path).
    if not count:
        raise util.Abort(_('the number of files to create is required'))
    if count < 0:
        raise util.Abort(_('the number of files to create must be positive'))
    if seed:
        random.seed(int(seed))

    try:
        fp = hg.openpath(ui, descpath)
    except Exception as err:
        # Not every exception carries strerror (and indexing the exception,
        # as was done before, yields the integer errno for a plain IOError),
        # so fall back to the exception's own message.
        reason = getattr(err, 'strerror', None) or str(err)
        raise util.Abort('%s: %s' % (descpath, reason))
    try:
        desc = json.load(fp)
    finally:
        fp.close()

    try:
        modeldirs = desc['dirs']
    except (KeyError, TypeError):
        modeldirs = None
    if not modeldirs:
        raise util.Abort(_('%s: model contains no directories') % descpath)

    # The dictionary must be loaded before the directories are renamed,
    # because renaming draws its names from the word list.
    dictfile = opts.get('dict') or '/usr/share/dict/words'
    try:
        fp = open(dictfile, 'rU')
    except IOError as err:
        raise util.Abort('%s: %s' % (dictfile, err.strerror))
    try:
        lines = fp.read().splitlines()
    finally:
        fp.close()
    # Blank lines would produce empty path components, so drop them.
    words = [w for w in (l.lower().replace("'", "") for l in lines) if w]
    if not words:
        raise util.Abort(_('%s: dictionary contains no words') % dictfile)

    # Shuffle first so that both directory names and file names are random.
    random.shuffle(words)
    newdirs = renamedirs(modeldirs, words)
    dirs = cdf(newdirs)

    # Size of dictionary is typically < 1M, so we pick filenames by iterating
    # through each pair of words from a shuffled word list. product() varies
    # its last element fastest; reversing each pair makes the leading word of
    # the generated name the one that changes from file to file.
    filenames = ('-'.join(reversed(parts))
                 for parts in itertools.product(words, words))

    _synthesizing = _('synthesizing')
    _files = _('files')

    wlock = lock = None
    try:
        # Acquire inside the try block so that wlock is released even when
        # taking the store lock fails.
        wlock = repo.wlock()
        lock = repo.lock()

        pctx = repo['tip']

        files = {}
        for i in xrange(count):
            ui.progress(_synthesizing, i, unit=_files, total=count)

            try:
                path = pickpath(dirs, filenames)
                while path in files:
                    path = pickpath(dirs, filenames)
            except StopIteration:
                # Every pair of dictionary words has been used up.
                raise util.Abort(_('dictionary too small to name %d files')
                                 % count)
            data = '%s contents\n' % path
            files[path] = context.memfilectx(repo, path, data)

        def filectxfn(repo, memctx, path):
            if path not in files:
                raise IOError('Unexpected file added in synthesize-wide: %s'
                              % path)
            return files[path]

        ui.progress(_synthesizing, None)
        ui.status(_('committing %d synthesized files\n') % (len(files),))
        message = 'synthesized wide repo with %d files' % (len(files),)
        user = ui.username()
        date = '%d %d' % util.makedate()
        mc = context.memctx(repo, [pctx.node(), nullid], message,
                            list(files),
                            filectxfn, user, date)
        newnode = mc.commit()

        hexfn = hex if ui.debugflag else short
        newnodehex = hexfn(newnode)

        ui.write(_('added commit %s\n') % (newnodehex,))
    finally:
        if lock is not None:
            lock.release()
        if wlock is not None:
            wlock.release()
+
def renamedirs(dirs, words):
    '''Randomly rename the directory names in the per-dir file count dict.

    dirs maps a directory path (optionally starting with os.sep, '' for the
    root) to its file count. words supplies the replacement path components;
    it is cycled, so it may be shorter than the number of directories.

    Returns a new dict mapping each renamed path to its file count. Every
    directory sharing a path prefix in the input shares the renamed prefix in
    the output. If cycling through words makes two directories end up with
    the same renamed path, their counts are added together rather than one
    silently replacing the other.

    Raises ValueError if a directory needs a name and words is empty.
    '''
    wordgen = itertools.cycle(words)
    # The root is never renamed; it also terminates the recursion below.
    replacements = {'': ''}

    def rename(dirpath):
        '''Recursively rename the directory and all path prefixes.

        The mapping from path to renamed path is stored for all path prefixes
        as in dynamic programming, ensuring linear runtime and consistent
        renaming of shared prefixes.'''
        if dirpath in replacements:
            return replacements[dirpath]
        head = os.path.split(dirpath)[0]
        # Rename the parent before drawing this directory's own word.
        newhead = rename(head)
        try:
            word = next(wordgen)
        except StopIteration:
            raise ValueError('cannot rename directories with an empty '
                             'word list')
        renamed = os.path.join(newhead, word)
        replacements[dirpath] = renamed
        return renamed

    result = {}
    # Visit paths in sorted order so that word assignment does not depend on
    # dict iteration order (keeps output reproducible for a fixed word list).
    for dirpath in sorted(dirs):
        renamed = rename(dirpath.lstrip(os.sep))
        result[renamed] = result.get(renamed, 0) + dirs[dirpath]
    return result
+
+# TODO(adgar): Extract common synthesis module, used here and in synthrepo.py.
def cdf(l):
    '''Build a cumulative distribution from a dict of value -> weight.

    Returns a pair (vals, cdfs): vals holds the keys ordered by decreasing
    weight and cdfs[i] is the cumulative probability of vals[0..i], so the
    last entry is 1.0. An empty dict yields ([], []). If all weights are
    zero, the values are treated as equally likely instead of dividing by
    zero.
    '''
    if not l:
        return [], []
    vals, probs = zip(*sorted(l.items(), key=lambda x: x[1], reverse=True))
    total = float(sum(probs))
    if total <= 0:
        # No weight information at all: fall back to a uniform distribution.
        n = len(vals)
        return vals, [(i + 1) / float(n) for i in range(n)]
    running, cdfs = 0, []
    for weight in probs:
        running += weight
        cdfs.append(running / total)
    return vals, cdfs
+
def pickcdf(cdf):
    '''Draw one value at random from a (values, cumulative probabilities) pair.'''
    values = cdf[0]
    thresholds = cdf[1]
    # Index of the first cumulative probability that is >= the random draw.
    index = bisect.bisect_left(thresholds, random.random())
    return values[index]
+
def pickpath(dirs, filenames):
    '''Return a path made of a randomly picked directory and the next filename.

    dirs is the (values, cumulative probabilities) pair that pickcdf draws
    from; filenames is an iterator of file names, advanced by one per call.
    '''
    dirname = pickcdf(dirs)
    # Only text needs encoding. Calling encode() on a byte string makes
    # Python 2 decode it as ASCII first, which fails for directory names that
    # contain non-ASCII bytes.
    if not isinstance(dirname, bytes):
        dirname = dirname.encode('utf-8')
    return os.path.join(dirname, next(filenames))
More information about the Mercurial-devel
mailing list